diff --git a/python/dp_accounting/dp_accounting/__init__.py b/python/dp_accounting/dp_accounting/__init__.py index 1d28d0c6..265d28ad 100644 --- a/python/dp_accounting/dp_accounting/__init__.py +++ b/python/dp_accounting/dp_accounting/__init__.py @@ -32,6 +32,7 @@ from dp_accounting.dp_event import SampledWithReplacementDpEvent from dp_accounting.dp_event import SelfComposedDpEvent from dp_accounting.dp_event import SingleEpochTreeAggregationDpEvent +from dp_accounting.dp_event import RandomAllocationDpEvent from dp_accounting.dp_event import TruncatedSubsampledGaussianDpEvent from dp_accounting.dp_event import UnsupportedDpEvent from dp_accounting.dp_event import ZCDpEvent diff --git a/python/dp_accounting/dp_accounting/dp_event.py b/python/dp_accounting/dp_accounting/dp_event.py index 019c3735..997c93d2 100644 --- a/python/dp_accounting/dp_accounting/dp_event.py +++ b/python/dp_accounting/dp_accounting/dp_event.py @@ -494,3 +494,23 @@ class TruncatedSubsampledGaussianDpEvent(DpEvent): sampling_probability: float truncated_batch_size: int noise_multiplier: float + + +@attr.s(frozen=True, slots=True, auto_attribs=True) +class RandomAllocationDpEvent(DpEvent): + """Represents the random-allocation mechanism. + + In this event, each element in the dataset is independently allocated to k + uniformly sampled calls to the mechanism out of t total calls. + + Currently only GaussianDpEvent is supported as the inner event. + + Attributes: + event: The DpEvent for each call to the mechanism (e.g., GaussianDpEvent). + k: Number of calls each element participates in. + t: Total number of calls to the mechanism. 
+ """ + + event: DpEvent + k: int + t: int diff --git a/python/dp_accounting/dp_accounting/dp_event_test.py b/python/dp_accounting/dp_accounting/dp_event_test.py index 7e208676..1f8e6e63 100644 --- a/python/dp_accounting/dp_accounting/dp_event_test.py +++ b/python/dp_accounting/dp_accounting/dp_event_test.py @@ -123,6 +123,14 @@ class DpEventTest(parameterized.TestCase): 1.0, ), ), + ( + 'random_allocation', + dp_event.RandomAllocationDpEvent( + event=dp_event.GaussianDpEvent(2.0), + k=10, + t=100, + ), + ), ) def test_to_from_named_tuple(self, event): named_tuple = event.to_named_tuple() @@ -134,6 +142,12 @@ def test_to_from_named_tuple(self, event): assert_not_contains_named_tuples(reconstructed) self.assertEqual(event, reconstructed) + def test_random_allocation_event_has_no_discretization_override(self): + event = dp_event.RandomAllocationDpEvent( + event=dp_event.GaussianDpEvent(2.0), k=10, t=100 + ) + self.assertFalse(hasattr(event, 'loss_discretization')) + if __name__ == '__main__': absltest.main() diff --git a/python/dp_accounting/dp_accounting/pld/pld_privacy_accountant.py b/python/dp_accounting/dp_accounting/pld/pld_privacy_accountant.py index d2acf7d1..8927f714 100644 --- a/python/dp_accounting/dp_accounting/pld/pld_privacy_accountant.py +++ b/python/dp_accounting/dp_accounting/pld/pld_privacy_accountant.py @@ -20,6 +20,7 @@ from dp_accounting import privacy_accountant from dp_accounting.pld import common from dp_accounting.pld import privacy_loss_distribution +from dp_accounting.pld import random_allocation as _random_allocation NeighborRel = privacy_accountant.NeighboringRelation CompositionErrorDetails = ( @@ -263,6 +264,31 @@ def _maybe_compose(self, event: dp_event.DpEvent, count: int, ) self._pld = self._pld.compose(truncated_subsampled_gaussian_pld) return None + elif isinstance(event, dp_event.RandomAllocationDpEvent): + ra_event: dp_event.RandomAllocationDpEvent = event + if not isinstance(ra_event.event, dp_event.GaussianDpEvent): + return 
CompositionErrorDetails( + invalid_event=event, + error_message=( + 'Subevent of `RandomAllocationDpEvent` must be ' + f'`GaussianDpEvent`. Found {ra_event.event}.' + ), + ) + if do_compose: + if ra_event.event.noise_multiplier == 0: + self._contains_non_dp_event = True + else: + params = _random_allocation.PrivacyParams( + sigma=ra_event.event.noise_multiplier, + num_steps=ra_event.t, + num_selected=ra_event.k, + ) + config = _random_allocation.AllocationSchemeConfig( + loss_discretization=self._value_discretization_interval, + ) + alloc_pld = _random_allocation.gaussian_allocation_pld(params, config) + self._pld = self._pld.compose(alloc_pld.self_compose(count)) + return None else: # Unsupported event (including `UnsupportedDpEvent`). return CompositionErrorDetails( diff --git a/python/dp_accounting/dp_accounting/pld/privacy_loss_distribution_basic_example.py b/python/dp_accounting/dp_accounting/pld/privacy_loss_distribution_basic_example.py index 47273078..b81e9c62 100644 --- a/python/dp_accounting/dp_accounting/pld/privacy_loss_distribution_basic_example.py +++ b/python/dp_accounting/dp_accounting/pld/privacy_loss_distribution_basic_example.py @@ -16,7 +16,10 @@ from absl import app +from dp_accounting import GaussianDpEvent +from dp_accounting import RandomAllocationDpEvent from dp_accounting.pld import privacy_loss_distribution +from dp_accounting.pld.pld_privacy_accountant import PLDAccountant def main(argv): @@ -67,6 +70,26 @@ def main(argv): f'{standard_deviation} is ({epsilon}, {delta})-DP.' ) + # The PLDAccountant also supports the random allocation mechanism. Each + # element is independently assigned to k out of t calls to the mechanism. 
+ noise_multiplier = 2.0 + k = 10 + t = 100 + accountant = PLDAccountant() + accountant.compose( + RandomAllocationDpEvent( + event=GaussianDpEvent(noise_multiplier=noise_multiplier), + k=k, + t=t, + ) + ) + target_delta = 1e-6 + epsilon = accountant.get_epsilon(target_delta) + print( + f'Random allocation with Gaussian noise (noise_multiplier={noise_multiplier},' + f' k={k}, t={t}) satisfies ({epsilon:.4f}, {target_delta})-DP.' + ) + if __name__ == '__main__': app.run(main) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/__init__.py b/python/dp_accounting/dp_accounting/pld/random_allocation/__init__.py new file mode 100644 index 00000000..f7613242 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/__init__.py @@ -0,0 +1,29 @@ +"""Public entry points for random-allocation privacy accounting.""" + +from .random_allocation_distributions import PLDRealization +from .random_allocation_api import ( + gaussian_allocation_pld, + general_allocation_pld, +) +from .random_allocation_types import ( + DEFAULT_LOSS_DISCRETIZATION, + DEFAULT_TAIL_TRUNCATION, + AllocationSchemeConfig, + BoundType, + Direction, + PrivacyParams, + SpacingType, +) + +__all__ = [ + "PLDRealization", + "AllocationSchemeConfig", + "BoundType", + "DEFAULT_LOSS_DISCRETIZATION", + "DEFAULT_TAIL_TRUNCATION", + "Direction", + "PrivacyParams", + "SpacingType", + "gaussian_allocation_pld", + "general_allocation_pld", +] diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api.py new file mode 100644 index 00000000..400291ae --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api.py @@ -0,0 +1,137 @@ +"""Public API surface for random-allocation PLD construction.""" + +from __future__ import annotations + +from functools import partial + +from dp_accounting.pld import privacy_loss_distribution + +from 
.random_allocation_distributions import PLDRealization +from .random_allocation_core import ( + allocation_full_pld, + gaussian_allocation_pld_core, + geometric_allocation_pld_base_add, + geometric_allocation_pld_base_remove, + realization_add_base_distribution, + realization_remove_base_distributions, +) +from .random_allocation_types import ( + AllocationSchemeConfig, + BoundType, + Direction, + PrivacyParams, +) +from .random_allocation_utils import ( + validate_allocation_params, + validate_allocation_scheme_config, + validate_bound_type, + validate_privacy_params, +) + + +def gaussian_allocation_pld( + params: PrivacyParams, + config: AllocationSchemeConfig, + bound_type: BoundType = BoundType.DOMINATES, +) -> privacy_loss_distribution.PrivacyLossDistribution: + """Compute upper / lower PLD for random-allocation with the Gaussian mechanism. + + Args: + params: Privacy parameters describing noise scale, number of steps, + and optional delta/epsilon query target. + config: Discretization and convolution configuration. + bound_type: Whether to compute a dominating or dominated discretized bound. + + Returns: + A ``dp_accounting`` ``PrivacyLossDistribution`` for both privacy directions. 
+ + """ + # Input validation + validate_privacy_params(params) + validate_allocation_scheme_config(config) + validate_bound_type(bound_type) + + compute_base_pld_remove = partial( + gaussian_allocation_pld_core, + direction=Direction.REMOVE, + sigma=params.sigma, + ) + compute_base_pld_add = partial( + gaussian_allocation_pld_core, + direction=Direction.ADD, + sigma=params.sigma, + ) + return allocation_full_pld( + compute_base_pld_remove=compute_base_pld_remove, + compute_base_pld_add=compute_base_pld_add, + num_steps=params.num_steps, + num_selected=params.num_selected, + num_epochs=params.num_epochs, + loss_discretization=config.loss_discretization, + tail_truncation=config.tail_truncation, + bound_type=bound_type, + ) + + +def general_allocation_pld( + num_steps: int, + num_selected: int, + num_epochs: int, + remove_realization: PLDRealization, + add_realization: PLDRealization, + config: AllocationSchemeConfig, + bound_type: BoundType = BoundType.DOMINATES, +) -> privacy_loss_distribution.PrivacyLossDistribution: + """Build a random-allocation PLD from explicit PLD realizations. + + Args: + num_steps: Total number of random-allocation steps. + num_selected: Number of selections per epoch. + num_epochs: Number of epochs. + remove_realization: Explicit remove-direction PLD realization. + add_realization: Explicit add-direction PLD realization. + config: Discretization and convolution configuration. + bound_type: Whether to compute a dominating or dominated discretized bound. + + Returns: + A ``dp_accounting`` ``PrivacyLossDistribution`` for the composed realization. + + Notes: + The delivery package always uses the geometric convolution path. 
+ + """ + # Input validation + validate_allocation_params(num_steps, num_selected, num_epochs) + if not isinstance(remove_realization, PLDRealization): + raise TypeError( + f"remove_realization must be PLDRealization, got {type(remove_realization)}" + ) + if not isinstance(add_realization, PLDRealization): + raise TypeError(f"add_realization must be PLDRealization, got {type(add_realization)}") + validate_allocation_scheme_config(config) + validate_bound_type(bound_type) + + compute_base_pld_remove = partial( + geometric_allocation_pld_base_remove, + base_distributions_creation=partial( + realization_remove_base_distributions, + realization=remove_realization, + ), + ) + compute_base_pld_add = partial( + geometric_allocation_pld_base_add, + base_distributions_creation=partial( + realization_add_base_distribution, + realization=add_realization, + ), + ) + return allocation_full_pld( + compute_base_pld_remove=compute_base_pld_remove, + compute_base_pld_add=compute_base_pld_add, + num_steps=num_steps, + num_selected=num_selected, + num_epochs=num_epochs, + loss_discretization=config.loss_discretization, + tail_truncation=config.tail_truncation, + bound_type=bound_type, + ) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api_test.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api_test.py new file mode 100644 index 00000000..e95a92e4 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_api_test.py @@ -0,0 +1,68 @@ +"""Delivery API tests: only PLD builders are public.""" + +from __future__ import annotations + +import numpy as np +import pytest +from dp_accounting.pld import privacy_loss_distribution + +from .random_allocation_api import gaussian_allocation_pld, general_allocation_pld +from .random_allocation_distributions import PLDRealization +from .random_allocation_types import AllocationSchemeConfig, BoundType, PrivacyParams + + +def 
_simple_realization() -> PLDRealization: + return PLDRealization( + x_min=0.0, + step=1.0, + prob_arr=np.array([0.7, 0.3], dtype=np.float64), + p_max=0.0, + ) + + +def test_gaussian_allocation_pld_returns_dp_accounting_pld(): + params = PrivacyParams(sigma=2.0, num_steps=5, num_selected=1, num_epochs=1) + config = AllocationSchemeConfig(loss_discretization=0.05, tail_truncation=1e-6) + + pld = gaussian_allocation_pld(params=params, config=config, bound_type=BoundType.DOMINATES) + + assert isinstance(pld, privacy_loss_distribution.PrivacyLossDistribution) + + +def test_general_allocation_pld_returns_dp_accounting_pld(): + config = AllocationSchemeConfig(loss_discretization=0.05, tail_truncation=1e-6) + + pld = general_allocation_pld( + num_steps=5, + num_selected=1, + num_epochs=1, + remove_realization=_simple_realization(), + add_realization=_simple_realization(), + config=config, + bound_type=BoundType.DOMINATES, + ) + + assert isinstance(pld, privacy_loss_distribution.PrivacyLossDistribution) + + +def test_non_pld_exports_are_not_exposed(): + import importlib + + package = importlib.import_module(__package__) + + assert not hasattr(package, "gaussian_allocation_epsilon_range") + assert not hasattr(package, "gaussian_distribution") + assert not hasattr(package, "subsample_pld") + + +def test_general_allocation_rejects_non_realization_inputs(): + config = AllocationSchemeConfig() + with pytest.raises(TypeError, match="remove_realization must be PLDRealization"): + general_allocation_pld( + num_steps=2, + num_selected=1, + num_epochs=1, + remove_realization=object(), + add_realization=_simple_realization(), + config=config, + ) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution.py new file mode 100644 index 00000000..acf3db32 --- /dev/null +++ 
b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution.py @@ -0,0 +1,657 @@ +"""FFT and geometric-grid convolution for privacy loss distributions.""" + +from __future__ import annotations + +import math +import os +import warnings + +import numpy as np +from dp_accounting.pld.common import compute_self_convolve_bounds +from numba import njit +from numpy.typing import NDArray +from scipy.fft import irfft, next_fast_len, rfft + +from .random_allocation_distributions import DenseDiscreteDist, Domain +from .random_allocation_distributions import enforce_mass_conservation, stable_isclose +from .random_allocation_types import BoundType, SpacingType +from .random_allocation_utils import ( + binary_self_convolve, + convolve_boundary_masses, + self_convolve_boundary_masses, + validate_bound_type, +) + +# Maximum bytes for a single FFT allocation (default 8 GB, override via MAX_FFT_BYTES env var) +MAX_FFT_BYTES = int(os.environ.get("MAX_FFT_BYTES", 8 * 1024**3)) + + +def fft_convolve( + *, + dist_1: DenseDiscreteDist, + dist_2: DenseDiscreteDist, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Convolve two linear-grid distributions via FFT.""" + if not ( + isinstance(dist_1, DenseDiscreteDist) and dist_1.spacing_type == SpacingType.LINEAR + ) or not (isinstance(dist_2, DenseDiscreteDist) and dist_2.spacing_type == SpacingType.LINEAR): + raise TypeError( + "fft_convolve requires linear DenseDiscreteDist inputs; " + f"got dist_1={type(dist_1).__name__} (spacing={dist_1.spacing_type}), " + f"dist_2={type(dist_2).__name__} (spacing={dist_2.spacing_type})" + ) + if dist_1.domain != dist_2.domain: + raise ValueError(f"Input domains must be identical, got {dist_1.domain} vs {dist_2.domain}") + if not np.any(dist_1.prob_arr) or not np.any(dist_2.prob_arr): + raise ValueError("FFT convolution requires nonzero finite mass in both inputs") + if not stable_isclose(a=dist_1.step, b=dist_2.step): + raise ValueError(f"Grid 
spacing must match: w1={dist_1.step:.12g} vs w2={dist_2.step:.12g}") + + width = dist_1.step + conv_x_min = dist_1.x_min + dist_2.x_min + + # --- Manual rfft/irfft with in-place multiply (saves one complex128 buffer) --- + conv_full_len = dist_1.prob_arr.size + dist_2.prob_arr.size - 1 + fft_size = next_fast_len(conv_full_len) + _check_fft_memory(fft_size, label="fft_convolve") + + # Capture ghost-mass bounds and normalization factors before FFT buffers are allocated + nz1 = np.nonzero(dist_1.prob_arr)[0] + nz2 = np.nonzero(dist_2.prob_arr)[0] + min_idx = int(nz1[0] + nz2[0]) + max_idx = int(nz1[-1] + nz2[-1]) + finite_prob_1 = math.fsum(map(float, dist_1.prob_arr)) + finite_prob_2 = math.fsum(map(float, dist_2.prob_arr)) + + # Self-squaring optimization: if both inputs are the same object, + # compute rfft once and square in-place (saves one complex buffer) + is_self_convolve = dist_1 is dist_2 or dist_1.prob_arr is dist_2.prob_arr + fft1 = rfft(dist_1.prob_arr, n=fft_size) + if is_self_convolve: + fft1 *= fft1 # in-place square + else: + fft2 = rfft(dist_2.prob_arr, n=fft_size) + fft1 *= fft2 # in-place multiply + del fft2 # free second complex buffer immediately + conv_full = irfft(fft1, n=fft_size, overwrite_x=True) + del fft1 # free complex buffer + conv_pmf = conv_full[:conv_full_len].copy() # copy needed portion + del conv_full # free full irfft output + + # Zero negative roundoff and ghost mass outside reachable support + conv_pmf[conv_pmf < 0] = 0.0 + conv_pmf[:min_idx] = 0.0 + max_idx_plus_one = max_idx + 1 + if max_idx_plus_one < conv_pmf.size: + conv_pmf[max_idx_plus_one:] = 0.0 + + current_finite_mass = math.fsum(map(float, conv_pmf)) + if current_finite_mass <= 0.0: + raise ValueError("FFT convolution produced zero finite mass") + # Renormalize finite mass before reattaching the analytically computed + # infinity masses. This corrects small drift from FFT arithmetic/clipping. 
+ conv_pmf *= finite_prob_1 * finite_prob_2 / current_finite_mass + + expected_p_min, expected_p_max = convolve_boundary_masses( + dist_1.p_min, dist_1.p_max, dist_2.p_min, dist_2.p_max, dist_1.domain + ) + conv_pmf, p_min, p_max = enforce_mass_conservation( + prob_arr=conv_pmf, + expected_p_min=expected_p_min, + expected_p_max=expected_p_max, + bound_type=bound_type, + ) + + return DenseDiscreteDist( + x_min=conv_x_min, + step=width, + prob_arr=conv_pmf, + p_min=p_min, + p_max=p_max, + domain=dist_1.domain, + ).truncate_edges(tail_truncation, bound_type) + + +def fft_self_convolve( + *, + dist: DenseDiscreteDist, + T: int, + tail_truncation: float, + bound_type: BoundType, + use_direct: bool, +) -> DenseDiscreteDist: + """T-fold self-convolution via FFT with optional direct exponentiation path.""" + if not (isinstance(dist, DenseDiscreteDist) and dist.spacing_type == SpacingType.LINEAR): + raise TypeError("fft_self_convolve requires DenseDiscreteDist input") + + if use_direct: + try: + return _fft_self_convolve_direct( + dist=dist, + T=T, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + except MemoryError: + warnings.warn( + f"fft_self_convolve: direct method exceeded {MAX_FFT_BYTES / 1024**3:.0f} GB " + f"memory limit for T={T}, pmf_size={dist.prob_arr.size:,}. " + f"Falling back to binary self-convolution." + ) + + self_conv = binary_self_convolve( + dist=dist, + T=T, + tail_truncation=tail_truncation, + bound_type=bound_type, + convolve=fft_convolve, + ) + if not ( + isinstance(self_conv, DenseDiscreteDist) and self_conv.spacing_type == SpacingType.LINEAR + ): + raise TypeError( + f"Expected DenseDiscreteDist from FFT self-convolution, got {type(self_conv)}" + ) + return self_conv + + +def _fft_self_convolve_direct( + *, + dist: DenseDiscreteDist, + T: int, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + # Budget split: the input tail_truncation is divided into three equal thirds. 
+ # _calc_fft_window_size: Chernoff-based window determines the one-sided tail + # cutoff (right-tail for DOMINATES, folded-back mass bound for IS_DOMINATED). + # explicit opposite-side trim: left_tail_ind for DOMINATES (zeroes left bins, + # pushes mass to p_max) / right_tail_ind for IS_DOMINATED (zeroes right bins, + # pushes mass to p_min). + # final truncate_edges: trims actual near-zero edge bins the Chernoff window + # conservatively included on the remaining untrimmed side, reducing output + # bin count without sacrificing accuracy. + # Total: 3 * (tail_truncation / 3) = tail_truncation + tail_truncation /= 3 + + finite_mass = math.fsum(map(float, dist.prob_arr)) + # The Chernoff window calculation expects a normalized finite PMF, so the + # tail target must be rescaled when some mass already sits at infinity. + normalized_pmf = dist.prob_arr / finite_mass + tail_truncation_rescaled = tail_truncation / finite_mass + + shift_left, window_size = _calc_fft_window_size( + pmf=normalized_pmf, num_convolutions=T, tail_truncation=tail_truncation_rescaled + ) + + fft_size = next_fast_len(max(window_size, dist.prob_arr.size)) + _check_fft_memory(fft_size, label=f"_fft_self_convolve_direct(T={T})") + fft_data = rfft(dist.prob_arr, n=fft_size) + fft_data **= T # in-place power: avoids allocating a second complex buffer + raw_conv = np.asarray(irfft(fft_data, n=fft_size, overwrite_x=True), dtype=np.float64) + del fft_data # free complex buffer + raw_conv[raw_conv < 0] = 0.0 + # ``shift_left`` is the left edge of the retained convolution window. Rolling aligns + # that window to index 0 so truncation logic can work in-place. + rolled_conv = np.roll(raw_conv, -shift_left) + + conv_p_min, conv_p_max = self_convolve_boundary_masses(dist, num_convolutions=T) + if bound_type == BoundType.DOMINATES: + # For an upper bound, any dropped left-tail mass is pushed to +inf. 
+ cumsum = np.cumsum(rolled_conv) + left_tail_ind = int(np.searchsorted(cumsum, tail_truncation, side="right")) + shifted_mass = math.fsum(map(float, rolled_conv[:left_tail_ind])) + rolled_conv[:left_tail_ind] = 0.0 + right_tail_mass = math.fsum(map(float, rolled_conv[window_size:])) + conv_p_max += shifted_mass + right_tail_mass + elif bound_type == BoundType.IS_DOMINATED: + # For a lower bound, dropped right-tail mass moves to -inf, while any + # overflow beyond the retained FFT window is folded onto the last kept + # finite bin to preserve domination direction. + cumsum = np.cumsum(rolled_conv[::-1]) + right_tail_ind = ( + rolled_conv.size - 1 - int(np.searchsorted(cumsum, tail_truncation, side="right")) + ) + after_right_tail = right_tail_ind + 1 + shifted_mass = math.fsum(map(float, rolled_conv[after_right_tail:])) + rolled_conv[after_right_tail:] = 0.0 + conv_p_min += shifted_mass + + right_tail_mass = math.fsum(map(float, rolled_conv[window_size:])) + rolled_conv[min(window_size, right_tail_ind) - 1] += right_tail_mass + else: + raise ValueError(f"Unknown BoundType: {bound_type}") + + x_min = dist.x_min * T + shift_left * dist.step + pmf_conv = rolled_conv[:window_size] + pmf_conv, p_min_final, p_max_final = enforce_mass_conservation( + prob_arr=pmf_conv, + expected_p_min=conv_p_min, + expected_p_max=conv_p_max, + bound_type=bound_type, + ) + + return DenseDiscreteDist( + x_min=x_min, + step=dist.step, + prob_arr=pmf_conv, + p_min=p_min_final, + p_max=p_max_final, + domain=dist.domain, + ).truncate_edges(tail_truncation, bound_type) + + +def _calc_fft_window_size( + *, pmf: np.ndarray, num_convolutions: int, tail_truncation: float +) -> tuple[int, int]: + """Calculate FFT window bounds for ``num_convolutions`` self-convolutions with fallback.""" + # ``compute_self_convolve_bounds`` gives a Chernoff-style window [lower, upper] that + # should contain all but ``tail_truncation`` mass after ``num_convolutions`` convolutions. 
+ lower_idx, upper_idx = compute_self_convolve_bounds(pmf, num_convolutions, tail_truncation) + window_size = upper_idx - lower_idx + 1 + + if not 0 < window_size < float("inf"): + lower_idx = 0 + n = len(pmf) + # Fallback to the exact full-support FFT length when the bound becomes + # numerically unusable for extreme truncation parameters. + window_size = num_convolutions * (n - 1) + 1 + warnings.warn( + "calc_fft_window_size: Chernoff bounds failed " + f"(tail_truncation={tail_truncation:.3e}, num_convolutions={num_convolutions}). " + f"Using fallback lower_idx=0, window_size={window_size:,} (n={n})." + ) + + return int(lower_idx), int(window_size) + + +def _check_fft_memory(fft_size: int, label: str = "FFT") -> None: + """Raise MemoryError if an FFT of this size would exceed the safety limit. + + rfft produces complex128 output (~16 bytes per element) and the input is + float64 (~8 bytes), so peak usage is roughly 24 * fft_size bytes. + """ + estimated_bytes = 24 * fft_size + if estimated_bytes > MAX_FFT_BYTES: + raise MemoryError( + f"{label}: estimated {estimated_bytes / 1024**3:.1f} GB for " + f"fft_size={fft_size:,} exceeds safety limit of " + f"{MAX_FFT_BYTES / 1024**3:.1f} GB. " + f"Reduce grid size, increase loss_discretization, or raise MAX_FFT_BYTES." + ) + +# Rounding tolerance for grid bin mapping — must stay at machine-epsilon scale +# to avoid misrouting mass between bins. +_GRID_ROUNDING_TOL = 10 * np.finfo(np.float64).eps + +# ============================================================================= +# PUBLIC API +# ============================================================================= + + +def geometric_convolve( + *, + dist_1: DenseDiscreteDist, + dist_2: DenseDiscreteDist, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Convolve two geometric-grid distributions. + + Algorithm 4 (`conv`) in Appendix C wrapper. 
+ For POSITIVES-domain distributions the 0 atom is neutral (not absorbing), + so cross-terms (0 + finite and finite + 0) are added to the finite PMF. + """ + # Input validation + if not ( + isinstance(dist_1, DenseDiscreteDist) + and dist_1.spacing_type == SpacingType.GEOMETRIC + and dist_1.domain == Domain.POSITIVES + ) or not ( + isinstance(dist_2, DenseDiscreteDist) + and dist_2.spacing_type == SpacingType.GEOMETRIC + and dist_2.domain == Domain.POSITIVES + ): + raise TypeError( + "geometric_convolve requires geometric DenseDiscreteDist inputs on " + f"Domain.POSITIVES; got dist_1={type(dist_1).__name__} " + f"(spacing={dist_1.spacing_type}, domain={dist_1.domain}), " + f"dist_2={type(dist_2).__name__} " + f"(spacing={dist_2.spacing_type}, domain={dist_2.domain})" + ) + if tail_truncation < 0: + raise ValueError(f"tail_truncation must be non-negative, got {tail_truncation}") + + # Ensure both inputs share the same geometric log step. + if not stable_isclose(a=dist_1.step, b=dist_2.step): + raise ValueError( + "Geometric log steps must match: " + f"step_1={dist_1.step:.12g}, step_2={dist_2.step:.12g}" + ) + geom_step = dist_1.step + + # Core Numeric Convolution + x_out, pmf_conv = _compute_geometric_convolution( + x1=dist_1.x_array, + p1=dist_1.prob_arr, + x2=dist_2.x_array, + p2=dist_2.prob_arr, + geom_step=geom_step, + bound_type=bound_type, + ) + + # Add cross-terms from the 0 atom + x_out_0 = float(x_out[0]) + pmf_conv = _add_single_zero_atom_cross_term( + pmf_conv=pmf_conv, + x_arr=dist_2.x_array, + prob_arr=dist_2.prob_arr, + zero_prob=dist_1.p_min, + x_out_0=x_out_0, + geom_step=geom_step, + bound_type=bound_type, + ) + pmf_conv = _add_single_zero_atom_cross_term( + pmf_conv=pmf_conv, + x_arr=dist_1.x_array, + prob_arr=dist_1.prob_arr, + zero_prob=dist_2.p_min, + x_out_0=x_out_0, + geom_step=geom_step, + bound_type=bound_type, + ) + + expected_p_min, expected_p_max = convolve_boundary_masses( + dist_1.p_min, dist_1.p_max, dist_2.p_min, dist_2.p_max, 
dist_1.domain + ) + + pmf_conv, p_min, p_max = enforce_mass_conservation( + prob_arr=pmf_conv, + expected_p_min=expected_p_min, + expected_p_max=expected_p_max, + bound_type=bound_type, + ) + + return DenseDiscreteDist( + x_min=float(x_out[0]), + step=geom_step, + prob_arr=pmf_conv, + p_min=p_min, + p_max=p_max, + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ).truncate_edges(tail_truncation, bound_type) + + +def geometric_self_convolve( + *, + dist: DenseDiscreteDist, + T: int, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Self-convolve distribution T times using binary exponentiation.""" + # Input validation + if not (isinstance(dist, DenseDiscreteDist) and dist.spacing_type == SpacingType.GEOMETRIC): + raise TypeError(f"dist must be DenseDiscreteDist, got {type(dist)}") + validate_bound_type(bound_type) + if T < 1: + raise ValueError(f"T must be >= 1, got {T}") + if tail_truncation < 0: + raise ValueError(f"tail_truncation must be non-negative, got {tail_truncation}") + + self_conv = binary_self_convolve( + dist=dist, + T=T, + tail_truncation=tail_truncation, + bound_type=bound_type, + convolve=geometric_convolve, + ) + if not ( + isinstance(self_conv, DenseDiscreteDist) and self_conv.spacing_type == SpacingType.GEOMETRIC + ): + raise TypeError(f"Expected DenseDiscreteDist from self-convolution, got {type(self_conv)}") + return self_conv + + +# ============================================================================= +# INTERNAL KERNEL IMPLEMENTATION +# ============================================================================= + + +def _compute_geometric_convolution( + *, + x1: NDArray[np.float64], + p1: NDArray[np.float64], + x2: NDArray[np.float64], + p2: NDArray[np.float64], + geom_step: float, + bound_type: BoundType, +) -> tuple[NDArray[np.float64], NDArray[np.float64]]: + """Align grids, compute bin mapping parameters, and invoke the Numba kernel. 
+ + Algorithm 4 (`conv`) with internal Algorithm 5 (`range-renorm`) in Appendix C. + """ + # --- A. Standardization (Swap & Pad) --- + # We normalize such that x_base (x1) starts at the lower value. + # This ensures scale = x2[0]/x1[0] >= 1, simplifying log calculations. + + # 1. Swap if necessary so x1[0] <= x2[0] + if x1[0] > x2[0]: + x1, p1, x2, p2 = x2, p2, x1, p1 + + # 2. Calculate Scale (Relative Offset) + scale = x2[0] / x1[0] + + # 3. Equalize Lengths (Right-Padding) + # The Numba kernel assumes arrays of equal length 'n'. + target_n = max(x1.size, x2.size) + if x1.size < target_n: + x1, p1 = _pad_right_geometric( + x=x1, + p=p1, + geom_step=geom_step, + target_n=target_n, + ) + elif x2.size < target_n: + x2, p2 = _pad_right_geometric( + x=x2, + p=p2, + geom_step=geom_step, + target_n=target_n, + ) + + # Convert to float64 for Numba compatibility + x_base = x1.astype(np.float64, copy=False) + pmf_base = p1.astype(np.float64, copy=False) + pmf_scaled = p2.astype(np.float64, copy=False) + + # --- B. 
Grid Mapping Parameters --- + n = x_base.size + + # Edge case: Single point + if n == 1: + mass = pmf_base[0] * pmf_scaled[0] + x_out = np.array([(scale + 1.0) * x_base[0]], dtype=np.float64) + pmf_out = np.array([mass], dtype=np.float64) + return x_out, pmf_out + + # Calculate shift parameters (delta) + log_r = geom_step + log_scale = np.log(scale) + log_ap1 = np.log(scale + 1.0) + + # Vectorized calculation for d=1..n-1 + d_vec = np.arange(n, dtype=np.float64) + log_r_d = d_vec * log_r + + log_lohi = np.logaddexp(0.0, log_scale + log_r_d) # log(1 + scale*r^d) + tau_lohi = (log_lohi - log_ap1) / log_r + + log_hilo = np.logaddexp(log_scale, log_r_d) # log(scale + r^d) + tau_hilo = (log_hilo - log_ap1) / log_r + + # Rounding strategy + delta_lohi = np.zeros(n, dtype=np.int64) + delta_hilo = np.zeros(n, dtype=np.int64) + rounding_eps = _GRID_ROUNDING_TOL + + if bound_type == BoundType.DOMINATES: + # Pessimistic: Round UP + delta_lohi[1:] = np.ceil(tau_lohi[1:] - rounding_eps).astype(np.int64) + delta_hilo[1:] = np.ceil(tau_hilo[1:] - rounding_eps).astype(np.int64) + elif bound_type == BoundType.IS_DOMINATED: + # Optimistic: Round DOWN + delta_lohi[1:] = np.floor(tau_lohi[1:] + rounding_eps).astype(np.int64) + delta_hilo[1:] = np.floor(tau_hilo[1:] + rounding_eps).astype(np.int64) + else: + raise ValueError(f"Unknown BoundType: {bound_type}") + + # --- C. 
Kernel Execution --- + pmf_out = _numba_geometric_kernel( + PMF_base=pmf_base, + PMF_scaled=pmf_scaled, + delta_lohi=delta_lohi, + delta_hilo=delta_hilo, + ) + + # Construct output X grid: x_out = x_base * (1 + scale) + x_out = x_base * (scale + 1.0) + + return x_out, pmf_out + + +def _pad_right_geometric( + *, + x: NDArray[np.float64], + p: NDArray[np.float64], + geom_step: float, + target_n: int, +) -> tuple[NDArray[np.float64], NDArray[np.float64]]: + """Extend grid to the right to reach target_n using geometric log step.""" + x = np.asarray(x, dtype=np.float64) + p = np.asarray(p, dtype=np.float64) + n = x.size + if n >= target_n: + return x, p + + k = target_n - n + tail = x[-1] * np.exp(geom_step * np.arange(1, k + 1, dtype=np.float64)) + + x_ext = np.concatenate([x, tail]) + p_ext = np.pad(p, (0, k), mode="constant") + return x_ext, p_ext + + +@njit(cache=True) +def _numba_geometric_kernel( + *, + PMF_base: NDArray[np.float64], + PMF_scaled: NDArray[np.float64], + delta_lohi: NDArray[np.int64], + delta_hilo: NDArray[np.int64], +) -> NDArray[np.float64]: + """Core convolution loop. + + Calculates Z = X + Y by iterating over the difference 'd' between indices. 
+ + """ + n = PMF_base.size + pmf_out = np.zeros(n, dtype=np.float64) + comp = np.zeros(n, dtype=np.float64) + + for i in range(n): + mass = PMF_base[i] * PMF_scaled[i] + y = mass - comp[i] + t = pmf_out[i] + y + comp[i] = (t - pmf_out[i]) - y + pmf_out[i] = t + + for d in range(1, n): + imax = n - d + kshift1 = int(delta_lohi[d]) + kshift2 = int(delta_hilo[d]) + + for i in range(imax): + k1 = i + kshift1 + mass1 = PMF_base[i] * PMF_scaled[i + d] + if 0 <= k1 < n: + y = mass1 - comp[k1] + t = pmf_out[k1] + y + comp[k1] = (t - pmf_out[k1]) - y + pmf_out[k1] = t + + k2 = i + kshift2 + mass2 = PMF_base[i + d] * PMF_scaled[i] + if 0 <= k2 < n: + y = mass2 - comp[k2] + t = pmf_out[k2] + y + comp[k2] = (t - pmf_out[k2]) - y + pmf_out[k2] = t + + return pmf_out + + +def _add_single_zero_atom_cross_term( + *, + pmf_conv: NDArray[np.float64], + x_arr: NDArray[np.float64], + prob_arr: NDArray[np.float64], + zero_prob: float, + x_out_0: float, + geom_step: float, + bound_type: BoundType, +) -> NDArray[np.float64]: + """Map one family of 0+finite cross-terms onto the fixed output grid.""" + if zero_prob == 0.0: + return pmf_conv + + return _numba_add_single_zero_atom_cross_term( + pmf_out=pmf_conv, + x_vals=np.asarray(x_arr, dtype=np.float64), + prob_arr=np.asarray(prob_arr, dtype=np.float64), + zero_prob=float(zero_prob), + x_out_0=float(x_out_0), + log_r=float(geom_step), + dominates=(bound_type == BoundType.DOMINATES), + ) + + +@njit(cache=True) +def _numba_add_single_zero_atom_cross_term( + *, + pmf_out: NDArray[np.float64], + x_vals: NDArray[np.float64], + prob_arr: NDArray[np.float64], + zero_prob: float, + x_out_0: float, + log_r: float, + dominates: bool, +) -> NDArray[np.float64]: + """Core loop for one 0+finite cross-term family.""" + n = pmf_out.size + + for i in range(prob_arr.size): + weight = prob_arr[i] * zero_prob + if weight == 0.0: + continue + + x = x_vals[i] + if x <= 0.0: + continue + + frac_k = math.log(x / x_out_0) / log_r + if dominates: + k = 
int(math.ceil(frac_k - _GRID_ROUNDING_TOL)) + else: + k = int(math.floor(frac_k + _GRID_ROUNDING_TOL)) + + if 0 <= k < n: + pmf_out[k] += weight + elif k < 0 and dominates: + # Upper-bound rounding maps sub-grid mass to the first finite bin. + pmf_out[0] += weight + + return pmf_out diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution_test.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution_test.py new file mode 100644 index 00000000..46540296 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_convolution_test.py @@ -0,0 +1,125 @@ +"""Tests for binary self-convolution on linear and geometric grids.""" + +import math + +import numpy as np +import pytest +from .random_allocation_convolution import fft_convolve, fft_self_convolve, geometric_convolve +from .random_allocation_distributions import DenseDiscreteDist, Domain +from .random_allocation_types import BoundType, SpacingType +from .random_allocation_utils import binary_self_convolve, log_geometric_to_linear + + +def _linear_dist(n: int = 5) -> DenseDiscreteDist: + x = np.linspace(0.0, 1.0, n) + pmf = np.ones(n, dtype=np.float64) / n + return DenseDiscreteDist.from_x_array(x_array=x, prob_arr=pmf) + + +def _geometric_dist(n: int = 6) -> DenseDiscreteDist: + x = np.geomspace(0.1, 1.0, n) + pmf = np.ones(n, dtype=np.float64) / n + return DenseDiscreteDist.from_x_array( + x_array=x, + prob_arr=pmf, + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + +def test_binary_self_convolve_rejects_invalid_t(): + dist = _linear_dist() + with pytest.raises(ValueError, match="T must be >= 1"): + binary_self_convolve( + dist=dist, + T=0, + tail_truncation=0.0, + bound_type=BoundType.DOMINATES, + convolve=fft_convolve, + ) + + +def test_binary_self_convolve_t1_identity(): + dist = _linear_dist() + result = binary_self_convolve( + dist=dist, T=1, tail_truncation=0.0, 
bound_type=BoundType.DOMINATES, convolve=fft_convolve + ) + assert np.allclose(result.x_array, dist.x_array) + assert np.allclose(result.prob_arr, dist.prob_arr) + + +def test_binary_self_convolve_matches_direct_fft_t2(): + dist = _linear_dist() + result = binary_self_convolve( + dist=dist, T=2, tail_truncation=0.0, bound_type=BoundType.DOMINATES, convolve=fft_convolve + ) + direct = fft_convolve(dist_1=dist, dist_2=dist, tail_truncation=0.0, bound_type=BoundType.DOMINATES) + assert np.allclose(result.x_array, direct.x_array) + assert np.allclose(result.prob_arr, direct.prob_arr, atol=1e-12) + + +def test_binary_self_convolve_matches_repeated_geometric(): + dist = _geometric_dist() + result = binary_self_convolve( + dist=dist, + T=3, + tail_truncation=0.0, + bound_type=BoundType.DOMINATES, + convolve=geometric_convolve, + ) + repeated = geometric_convolve( + dist_1=dist, dist_2=dist, tail_truncation=0.0, bound_type=BoundType.DOMINATES + ) + repeated = geometric_convolve( + dist_1=repeated, dist_2=dist, tail_truncation=0.0, bound_type=BoundType.DOMINATES + ) + assert np.allclose(result.x_array, repeated.x_array) + assert np.allclose(result.prob_arr, repeated.prob_arr, atol=1e-12) + + +def test_geometric_convolve_preserves_step_for_linear_round_trip(): + step = 1e-4 + dist = DenseDiscreteDist( + x_min=1.0, + step=step, + prob_arr=np.array([0.5, 0.5], dtype=np.float64), + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + result = geometric_convolve( + dist_1=dist, + dist_2=dist, + tail_truncation=0.0, + bound_type=BoundType.DOMINATES, + ) + + assert result.step == step + assert log_geometric_to_linear(result).step == step + + +def test_binary_self_convolve_preserves_mass_fft(): + dist = _linear_dist() + result = binary_self_convolve( + dist=dist, T=5, tail_truncation=0.0, bound_type=BoundType.DOMINATES, convolve=fft_convolve + ) + total = math.fsum([*map(float, result.prob_arr), result.p_min, result.p_max]) + assert np.isclose(total, 1.0, 
atol=1e-10) + + +def test_fft_self_convolve_direct_vs_binary(): + dist = _linear_dist(n=9) + direct = fft_self_convolve( + dist=dist, T=7, tail_truncation=0.0, bound_type=BoundType.DOMINATES, use_direct=True + ) + binary = fft_self_convolve( + dist=dist, T=7, tail_truncation=0.0, bound_type=BoundType.DOMINATES, use_direct=False + ) + + direct_mass = math.fsum([*map(float, direct.prob_arr), direct.p_min, direct.p_max]) + binary_mass = math.fsum([*map(float, binary.prob_arr), binary.p_min, binary.p_max]) + assert np.isclose(direct_mass, 1.0, atol=1e-10) + assert np.isclose(binary_mass, 1.0, atol=1e-10) + + assert direct.x_array.size >= 2 + assert binary.x_array.size >= 2 diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_core.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_core.py new file mode 100644 index 00000000..1777f848 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_core.py @@ -0,0 +1,794 @@ +"""Random-allocation composition internals (bridge, realizations, Gaussian path).""" + +from __future__ import annotations + +from functools import partial +from typing import Callable + +import numpy as np +from dp_accounting.pld import privacy_loss_distribution +from dp_accounting.pld.pld_pmf import DensePLDPmf +from scipy import stats + +from .random_allocation_convolution import ( + fft_convolve, + fft_self_convolve, + geometric_convolve, + geometric_self_convolve, +) +from .random_allocation_distributions import ( + DenseDiscreteDist, + PLDRealization, + discretize_aligned_grid, + discretize_continuous_grid, + rediscretize_dist, +) +from .random_allocation_types import ( + BoundType, + Direction, + SpacingType, +) +from .random_allocation_utils import ( + calc_pld_dual, + exp_linear_to_geometric, + log_geometric_to_linear, + negate_reverse_linear_distribution, + validate_allocation_params, + validate_bound_type, + validate_discretization_params, +) + 
+def linear_dist_to_dp_accounting_pmf( + *, + dist: DenseDiscreteDist, + pessimistic_estimate: bool = True, +) -> DensePLDPmf: + """Convert a linear-grid loss PMF to a dp_accounting PMF. + + Args: + dist: Linear-grid loss distribution compatible with dp_accounting. + Must be a linear DenseDiscreteDist. ``x_min`` is rounded to the + nearest step multiple when forming dp_accounting's integer + ``lower_loss`` index. + pessimistic_estimate: Whether to use pessimistic estimate in dp_accounting. + + Returns: + dp_accounting DensePLDPmf with infinity mass taken from dist.p_max. + """ + if not (isinstance(dist, DenseDiscreteDist) and dist.spacing_type == SpacingType.LINEAR): + raise TypeError( + f"linear_dist_to_dp_accounting_pmf requires DenseDiscreteDist, got {type(dist)}." + ) + + base_index = int(np.rint(dist.x_min / dist.step)) + return DensePLDPmf( + discretization=dist.step, + lower_loss=base_index, + probs=dist.prob_arr.astype(np.float64), + infinity_mass=dist.p_max, + pessimistic_estimate=pessimistic_estimate, + ) + +def realization_remove_base_distributions( + *, + realization: PLDRealization, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> tuple[DenseDiscreteDist, DenseDiscreteDist]: + """Prepare remove-direction factors from a loss-space realization. + + Algorithm 1 (`rand-alloc-rem`) in Appendix C. + + Args: + realization: REMOVE-direction realization in linear loss space. + loss_discretization: Target linear-grid spacing. + tail_truncation: Tail truncation budget for regridding. + bound_type: Bound direction. + + Returns: + Tuple ``(base, dual_base)`` aligned to the requested linear grid. + + """ + # Since dual can be derived only from a PLD realization, discretization can + # come first for DOMINATES, but dual derivation must come first for IS_DOMINATED. + if bound_type == BoundType.DOMINATES: + # Avoid inflating the grid when the target is finer than the original one. 
+ effective_disc = max(realization.step, loss_discretization) + coarsened_base = rediscretize_dist( + dist=realization, + tail_truncation=tail_truncation, + loss_discretization=effective_disc, + spacing_type=SpacingType.LINEAR, + bound_type=bound_type, + ) + if not ( + isinstance(coarsened_base, DenseDiscreteDist) + and coarsened_base.spacing_type == SpacingType.LINEAR + ): + _st = getattr(coarsened_base, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(coarsened_base).__name__} with spacing {_st}" + ) + base_realization = PLDRealization.from_linear_dist(coarsened_base) + neg_dual_dist = negate_reverse_linear_distribution(calc_pld_dual(base_realization)) + return base_realization, neg_dual_dist + + # Lower-bound truncation can move left-tail mass into p_min and must consume + # any +inf mass before exp-space composition, so keep the lower path on the + # plain DenseDiscreteDist rediscretization route unconditionally. + dual_realization = calc_pld_dual(realization) + neg_dual_linear = negate_reverse_linear_distribution(dual_realization) + # Avoid inflating the grid when the target is finer than the original one. 
+ effective_disc = max(realization.step, loss_discretization) + lower_realization_input = DenseDiscreteDist( + x_min=realization.x_min, + step=realization.step, + prob_arr=realization.prob_arr.copy(), + p_min=realization.p_min, + p_max=realization.p_max, + ) + lower_base_dist = rediscretize_dist( + dist=lower_realization_input, + tail_truncation=tail_truncation, + loss_discretization=effective_disc, + spacing_type=SpacingType.LINEAR, + bound_type=bound_type, + ) + if not ( + isinstance(lower_base_dist, DenseDiscreteDist) + and lower_base_dist.spacing_type == SpacingType.LINEAR + ): + _st = getattr(lower_base_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(lower_base_dist).__name__} with spacing {_st}" + ) + neg_dual_dist = rediscretize_dist( + dist=neg_dual_linear, + tail_truncation=tail_truncation, + loss_discretization=effective_disc, + spacing_type=SpacingType.LINEAR, + bound_type=bound_type, + ) + if not ( + isinstance(neg_dual_dist, DenseDiscreteDist) + and neg_dual_dist.spacing_type == SpacingType.LINEAR + ): + _st = getattr(neg_dual_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(neg_dual_dist).__name__} with spacing {_st}" + ) + return lower_base_dist, neg_dual_dist + + +def realization_add_base_distribution( + *, + realization: PLDRealization, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Prepare add-direction factors from a loss-space realization. + + Algorithm 2 (`rand-alloc-add`) in Appendix C. + + Args: + realization: ADD-direction realization in linear loss space. + loss_discretization: Target linear-grid spacing. + tail_truncation: Tail truncation budget for regridding. + bound_type: Bound direction. + + Returns: + One ADD loss factor aligned to the requested linear grid. + + """ + # Avoid inflating the grid when the target is finer than the original one. 
+ effective_disc = max(realization.step, loss_discretization) + coarsened = rediscretize_dist( + dist=realization, + tail_truncation=tail_truncation, + loss_discretization=effective_disc, + spacing_type=SpacingType.LINEAR, + bound_type=bound_type, + ) + if not ( + isinstance(coarsened, DenseDiscreteDist) and coarsened.spacing_type == SpacingType.LINEAR + ): + _st = getattr(coarsened, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(coarsened).__name__} with spacing {_st}" + ) + return coarsened + +# ============================================================================= +# Public API +# ============================================================================= + + +def allocation_full_pld( + *, + compute_base_pld_remove: Callable[..., DenseDiscreteDist], + compute_base_pld_add: Callable[..., DenseDiscreteDist], + num_steps: int, + num_selected: int, + num_epochs: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> privacy_loss_distribution.PrivacyLossDistribution: + """Orchestrate full allocation PLD construction for both directions. + + This function builds REMOVE and ADD directional PLDs via + ``allocation_directional_pld(...)`` and then converts them to the final + ``dp_accounting`` PLD object. 
+ """ + # Input validation + validate_allocation_params(num_steps, num_selected, num_epochs) + validate_discretization_params(loss_discretization, tail_truncation) + validate_bound_type(bound_type) + + remove_dist = allocation_directional_pld( + compute_base_pld=compute_base_pld_remove, + num_steps=num_steps, + num_selected=num_selected, + num_epochs=num_epochs, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + add_dist = allocation_directional_pld( + compute_base_pld=compute_base_pld_add, + num_steps=num_steps, + num_selected=num_selected, + num_epochs=num_epochs, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + return _compose_full_pld( + remove_dist=remove_dist, + add_dist=add_dist, + bound_type=bound_type, + ) + + +def allocation_directional_pld( + *, + compute_base_pld: Callable[..., DenseDiscreteDist], + num_steps: int, + num_selected: int, + num_epochs: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Build one-direction allocation PLD with adaptive floor/ceil decomposition. + + For divisible ``num_steps / num_selected``, this builds one component. For + non-divisible cases, it builds floor and ceil components via + ``_allocation_directional_pld_core(...)`` and combines them with one final + ``fft_convolve(...)``. 
+ """ + # Input validation + validate_allocation_params(num_steps, num_selected, num_epochs) + validate_discretization_params(loss_discretization, tail_truncation) + validate_bound_type(bound_type) + new_num_steps_floor = int(num_steps // num_selected) + if new_num_steps_floor < 1: + raise ValueError("num_steps must be >= num_selected") + num_epochs_remainder = num_steps - num_selected * new_num_steps_floor + new_num_steps_ceil = new_num_steps_floor + 1 + new_num_epochs_floor = (num_selected - num_epochs_remainder) * num_epochs + new_num_epochs_ceil = num_epochs_remainder * num_epochs + # Tail: active tail-consuming ops = one _allocation_directional_pld_core per component + # plus one fft_convolve when both components are active, giving 2*component_count - 1 + # ops total (1 for component_count=1, 3 for component_count=2). After + # tail_truncation /= (2*component_count - 1), each op consumes at most the rescaled + # budget, and all ops together sum to <= (2*component_count - 1) * rescaled = tail_truncation. 
+ component_count = int(new_num_epochs_floor > 0) + int(new_num_epochs_ceil > 0) + tail_truncation /= 2 * component_count - 1 + + dist_floor = None + dist_ceil = None + if new_num_epochs_floor > 0: + dist_floor = _allocation_directional_pld_core( + compute_base_pld=compute_base_pld, + num_steps=new_num_steps_floor, + num_epochs=new_num_epochs_floor, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + if new_num_epochs_ceil > 0: + dist_ceil = _allocation_directional_pld_core( + compute_base_pld=compute_base_pld, + num_steps=new_num_steps_ceil, + num_epochs=new_num_epochs_ceil, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + + if dist_floor is None: + if dist_ceil is None: + raise RuntimeError( + "allocation_directional_pld failed to build either floor or ceil component" + ) + return dist_ceil + if dist_ceil is None: + return dist_floor + if dist_floor.step != dist_ceil.step: + raise ValueError( + "Cannot convolve floor and ceil allocation components with different " + f"grid steps: {dist_floor.step:.12g} vs {dist_ceil.step:.12g}." + ) + return fft_convolve( + dist_1=dist_floor, + dist_2=dist_ceil, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + + +def geometric_allocation_pld_base_remove( + *, + base_distributions_creation: Callable[..., tuple[DenseDiscreteDist, DenseDiscreteDist]], + num_steps: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Build the REMOVE component PLD via exp-space geometric composition. + + The callback ``base_distributions_creation`` provides one-step + ``(base, neg_dual_base)`` factors, which are shifted and composed. 
+ """ + # Input validation + if num_steps < 1: + raise ValueError(f"num_steps must be >= 1, got {num_steps}") + validate_discretization_params(loss_discretization, tail_truncation) + validate_bound_type(bound_type) + # For num_steps > 1 there are active convolution stages beyond base construction. + # For num_steps == 1 neither convolution stages nor Phases 2/3 execute, so no + # tail-budget division is needed. + if num_steps > 1: + # Tail: three phases each receive tail_truncation / 3 after division. Contributions + # to output p_max (letting T = num_steps, budget/3 = rescaled per-phase share): + # Phase 1 (base_distributions_creation): base_factor_tail_truncation = budget/(3T). + # neg_dual is self-convolved T-1 times, base convolved once; total amplified + # contribution = ((T-1) + 1) * budget/(3T) = budget/3. + # Phase 2 (geometric_self_convolve, T-1): called with budget/3 -> <= budget/3. + # Phase 3 (geometric_convolve): called with budget/3 -> <= budget/3. + # Sum <= budget/3 + budget/3 + budget/3 = budget + tail_truncation /= 3 + # Each base factor is one of num_steps terms in the final product; its individual + # tail error is amplified by num_steps through self-convolution, so scale its budget + # down by num_steps so the amplified contribution stays within the phase budget. + base_factor_tail_truncation = tail_truncation / num_steps + + base, neg_dual_base = base_distributions_creation( + loss_discretization=loss_discretization, + tail_truncation=base_factor_tail_truncation, + bound_type=bound_type, + ) + + # For num_steps == 1 the centering shift is log(1) = 0 and the exp/log round-trip + # is an identity, so base is already the final result. 
+ if num_steps == 1: + return base + + # Subtract the average loss + log_num_steps = float(np.log(num_steps)) + centered_neg_dual = DenseDiscreteDist( + x_min=neg_dual_base.x_min - log_num_steps, + step=neg_dual_base.step, + prob_arr=neg_dual_base.prob_arr.copy(), + p_min=neg_dual_base.p_min, + p_max=neg_dual_base.p_max, + ) + centered_base = DenseDiscreteDist( + x_min=base.x_min - log_num_steps, + step=base.step, + prob_arr=base.prob_arr.copy(), + p_min=base.p_min, + p_max=base.p_max, + ) + + # Factor preparation in exp-space. + exp_neg_dual = exp_linear_to_geometric(centered_neg_dual) + exp_base = exp_linear_to_geometric(centered_base) + + # V_{t-1} <- self-conv(V1, t-1, ...). + exp_convolved_dual = geometric_self_convolve( + dist=exp_neg_dual, + T=num_steps - 1, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + # U_t <- conv(V_{t-1}, U1, ...). + exp_convolved = geometric_convolve( + dist_1=exp_convolved_dual, + dist_2=exp_base, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + # L_t <- log(U_t). + return log_geometric_to_linear(exp_convolved) + + +def geometric_allocation_pld_base_add( + *, + base_distributions_creation: Callable[..., DenseDiscreteDist], + num_steps: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Build the ADD component PLD via exp-space geometric self-composition. + + The callback ``base_distributions_creation`` provides the one-step ADD + factor, which is shifted and composed before mapping back to linear loss. + """ + # Input validation + validate_discretization_params(loss_discretization, tail_truncation) + validate_bound_type(bound_type) + if num_steps < 1: + raise ValueError(f"num_steps must be >= 1, got {num_steps}") + + # For num_steps > 1 there are active convolution stages beyond base construction. + # For num_steps == 1 neither convolution stages nor Phase 2 execute, so no + # tail-budget division is needed. 
+ if num_steps > 1: + # Tail: after dividing by 2, each of the two phases receives tail_truncation: + # Phase 1 (base creation): tail_truncation / num_steps per call; num_steps-fold + # self-convolution amplifies the contribution back to tail_truncation. + # Phase 2 (geometric_self_convolve, num_steps): tail_truncation directly. + # Sum: tail_truncation + tail_truncation = 2 * tail_truncation = original ✓ + tail_truncation /= 2 + # Each base factor's tail error is amplified by num_steps through self-convolution, + # so scale its budget down by num_steps so the amplified contribution stays within the + # phase budget. + base_factor_tail_truncation = tail_truncation / num_steps + + base = base_distributions_creation( + loss_discretization=loss_discretization, + tail_truncation=base_factor_tail_truncation, + bound_type=bound_type, + ) + + # For num_steps == 1 the centering shift is log(1) = 0 and the exp/log round-trip + # is an identity, so base is already the final result. + if num_steps == 1: + return base + + log_num_steps = float(np.log(num_steps)) + + neg_base = negate_reverse_linear_distribution(base) + centered_neg_base = DenseDiscreteDist( + x_min=neg_base.x_min - log_num_steps, + step=neg_base.step, + prob_arr=neg_base.prob_arr.copy(), + p_min=neg_base.p_min, + p_max=neg_base.p_max, + ) + + # Factor preparation in exp-space. + exp_base = exp_linear_to_geometric(centered_neg_base) + exp_bound_type = ( + BoundType.IS_DOMINATED if bound_type == BoundType.DOMINATES else BoundType.DOMINATES + ) + # U_t <- self-conv(U, t, lower). + exp_convolved = geometric_self_convolve( + dist=exp_base, + T=num_steps, + tail_truncation=tail_truncation, + bound_type=exp_bound_type, + ) + # L_t <- -log(U_t). 
+ log_dist = log_geometric_to_linear(exp_convolved) + return negate_reverse_linear_distribution(log_dist) + + +def gaussian_allocation_pld_core( + *, + num_steps: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, + direction: Direction, + sigma: float, +) -> DenseDiscreteDist: + """Route one Gaussian component through the GEOM backend. + + This is the Gaussian-side orchestrator used by the shared allocation core. + """ + # Input validation + if num_steps < 1: + raise ValueError(f"num_steps must be >= 1, got {num_steps}") + if sigma <= 0: + raise ValueError(f"sigma must be positive, got {sigma}") + validate_discretization_params(loss_discretization, tail_truncation) + validate_bound_type(bound_type) + if direction not in (Direction.ADD, Direction.REMOVE): + raise ValueError(f"Invalid direction: {direction}") + + return _gaussian_allocation_geom( + num_steps=num_steps, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + direction=direction, + sigma=sigma, + ) + + +def _allocation_directional_pld_core( + *, + compute_base_pld: Callable[..., DenseDiscreteDist], + num_steps: int, + num_epochs: int, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Build and finalize one floor/ceil decomposition component. + + This function derives component-level budgets, calls + ``compute_base_pld(...)``, trims tails before and after epoch + composition, and returns the resulting linear distribution. + """ + # Tail: divide by 3; each of the three phases receives tail_truncation (the divided value): + # Phase 1 (base creation, amplified by num_epochs): base construction and + # truncate_edges each use base_tail_truncation. Amplified total: + # num_epochs * 2 * tail_truncation / (2 * num_epochs) = tail_truncation. + # Phase 2 (fft_self_convolve, T=num_epochs): tail_truncation directly + # (skipped if num_epochs=1). 
+ # Phase 3 (final truncate_edges): tail_truncation directly. + # Sum: tail_truncation + tail_truncation + tail_truncation = 3 * tail_truncation = original ✓ + tail_truncation /= 3 + base_tail_truncation = tail_truncation / (2 * num_epochs) + + base_dist = compute_base_pld( + num_steps=num_steps, + loss_discretization=loss_discretization, + tail_truncation=base_tail_truncation, + bound_type=bound_type, + ) + prepared_base_dist = base_dist.truncate_edges( + tail_truncation=base_tail_truncation, + bound_type=bound_type, + ) + if not ( + isinstance(prepared_base_dist, DenseDiscreteDist) + and prepared_base_dist.spacing_type == SpacingType.LINEAR + ): + _st = getattr(prepared_base_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(prepared_base_dist).__name__} with spacing {_st}" + ) + + if num_epochs == 1: + composed_dist = prepared_base_dist + else: + composed_dist = fft_self_convolve( + dist=prepared_base_dist, + T=num_epochs, + tail_truncation=tail_truncation, + bound_type=bound_type, + use_direct=True, + ) + final_dist = composed_dist.truncate_edges( + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + if not ( + isinstance(final_dist, DenseDiscreteDist) and final_dist.spacing_type == SpacingType.LINEAR + ): + _st = getattr(final_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with LINEAR spacing, " + f"got {type(final_dist).__name__} with spacing {_st}" + ) + return final_dist + + +def _compose_full_pld( + *, + remove_dist: DenseDiscreteDist | None, + add_dist: DenseDiscreteDist | None, + bound_type: BoundType, +) -> privacy_loss_distribution.PrivacyLossDistribution: + """Convert remove/add directional PLDs into a ``dp_accounting`` PLD. + + Args: + remove_dist: REMOVE-direction linear PLD. + add_dist: Optional ADD-direction linear PLD. + bound_type: Bound direction used for pessimistic conversion. + + Returns: + A ``dp_accounting`` privacy loss distribution. 
+
+    """
+    if remove_dist is None:
+        raise ValueError(
+            "PLD construction requires remove-direction distribution. "
+            "Provide remove_realization or use both directions."
+        )
+    pessimistic_estimate = bound_type == BoundType.DOMINATES
+    pmf_remove = linear_dist_to_dp_accounting_pmf(
+        dist=remove_dist,
+        pessimistic_estimate=pessimistic_estimate,
+    )
+    if add_dist is None:
+        return privacy_loss_distribution.PrivacyLossDistribution(
+            pmf_remove=pmf_remove,
+        )
+    pmf_add = linear_dist_to_dp_accounting_pmf(
+        dist=add_dist,
+        pessimistic_estimate=pessimistic_estimate,
+    )
+    return privacy_loss_distribution.PrivacyLossDistribution(
+        pmf_remove=pmf_remove,
+        pmf_add=pmf_add,
+    )
+
+
+# =============================================================================
+# Internal GEOM Route
+# =============================================================================
+
+
+def _gaussian_allocation_geom(
+    *,
+    num_steps: int,
+    loss_discretization: float,
+    tail_truncation: float,
+    bound_type: BoundType,
+    direction: Direction,
+    sigma: float,
+) -> DenseDiscreteDist:
+    """GEOM path intentionally mirrors realization path after base creation.
+
+    Both call geometric_allocation_pld_base_* with identical wiring.
+ + """ + if direction == Direction.ADD: + return geometric_allocation_pld_base_add( + base_distributions_creation=partial( + _gaussian_add_geom_loss_factor, + sigma=sigma, + ), + num_steps=num_steps, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + if direction == Direction.REMOVE: + return geometric_allocation_pld_base_remove( + base_distributions_creation=partial( + _gaussian_remove_geom_loss_factors, + sigma=sigma, + ), + num_steps=num_steps, + loss_discretization=loss_discretization, + tail_truncation=tail_truncation, + bound_type=bound_type, + ) + raise ValueError(f"Invalid direction: {direction}") + + +def _gaussian_remove_geom_loss_factors( + *, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, + sigma: float, +) -> tuple[DenseDiscreteDist, DenseDiscreteDist]: + """Build REMOVE GEOM one-step PLD factors as ``(base, dual_base)``.""" + sigma_inv = 1.0 / sigma + factor_tail_truncation = tail_truncation / 2 + + dual_norm_mean = -(sigma_inv**2) / 2 + base_norm_mean = sigma_inv**2 / 2 + exp_dual = stats.lognorm(s=sigma_inv, scale=np.exp(dual_norm_mean)) + exp_base = stats.lognorm(s=sigma_inv, scale=np.exp(base_norm_mean)) + + geom_step = float(loss_discretization) + dual_x_min = float(exp_dual.ppf(factor_tail_truncation)) + dual_x_max = float(exp_dual.isf(factor_tail_truncation)) + base_x_min = float(exp_base.ppf(factor_tail_truncation)) + base_x_max = float(exp_base.isf(factor_tail_truncation)) + dual_grid = discretize_aligned_grid( + x_min=dual_x_min, + x_max=dual_x_max, + spacing_type=SpacingType.GEOMETRIC, + align_to_multiples=True, + discretization=geom_step, + ) + base_grid = discretize_aligned_grid( + x_min=base_x_min, + x_max=base_x_max, + spacing_type=SpacingType.GEOMETRIC, + align_to_multiples=True, + discretization=geom_step, + ) + + dual_factor_dist = discretize_continuous_grid( + dist=exp_dual, + grid=dual_grid, + bound_type=bound_type, + 
PMF_min_increment=factor_tail_truncation, + ) + if not ( + isinstance(dual_factor_dist, DenseDiscreteDist) + and dual_factor_dist.spacing_type == SpacingType.GEOMETRIC + ): + _st = getattr(dual_factor_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with GEOMETRIC spacing, " + f"got {type(dual_factor_dist).__name__} with spacing {_st}" + ) + + base_factor_dist = discretize_continuous_grid( + dist=exp_base, + grid=base_grid, + bound_type=bound_type, + PMF_min_increment=factor_tail_truncation, + ) + if not ( + isinstance(base_factor_dist, DenseDiscreteDist) + and base_factor_dist.spacing_type == SpacingType.GEOMETRIC + ): + _st = getattr(base_factor_dist, "spacing_type", "?") + raise TypeError( + "Expected DenseDiscreteDist with GEOMETRIC spacing, " + f"got {type(base_factor_dist).__name__} with spacing {_st}" + ) + + dual_loss_factor = log_geometric_to_linear(dual_factor_dist) + base_loss_factor = log_geometric_to_linear(base_factor_dist) + # geometric_allocation_pld_base_remove expects (base, dual_base). 
+ return base_loss_factor, dual_loss_factor + + +def _gaussian_add_geom_loss_factor( + *, + loss_discretization: float, + tail_truncation: float, + bound_type: BoundType, + sigma: float, +) -> DenseDiscreteDist: + """Build ADD GEOM one-step linear PLD factor.""" + sigma_inv = 1.0 / sigma + + base_lognorm = stats.lognorm(s=sigma_inv, scale=np.exp(+(sigma_inv**2) / 2)) + geom_step = float(loss_discretization) + + base_x_min = float(base_lognorm.ppf(tail_truncation)) + base_x_max = float(base_lognorm.isf(tail_truncation)) + base_grid = discretize_aligned_grid( + x_min=base_x_min, + x_max=base_x_max, + spacing_type=SpacingType.GEOMETRIC, + align_to_multiples=True, + discretization=geom_step, + ) + base_dist = discretize_continuous_grid( + dist=base_lognorm, + grid=base_grid, + bound_type=bound_type, + PMF_min_increment=tail_truncation, + ) + if not ( + isinstance(base_dist, DenseDiscreteDist) and base_dist.spacing_type == SpacingType.GEOMETRIC + ): + raise TypeError( + f"Expected DenseDiscreteDist with GEOMETRIC spacing, " + f"got {type(base_dist).__name__} with spacing {getattr(base_dist, 'spacing_type', '?')}" + ) + return log_geometric_to_linear(base_dist) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions.py new file mode 100644 index 00000000..521ac13d --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions.py @@ -0,0 +1,1390 @@ +"""Discrete distributions, PMF utilities, and grid discretization.""" + +from __future__ import annotations + +import math +from abc import ABC, abstractmethod +from enum import Enum +from typing import Any + +import numpy as np +from numba import njit +from numpy.typing import NDArray +from scipy import stats +from scipy.stats._distn_infrastructure import rv_frozen +from typing_extensions import Self + +from .random_allocation_types import ( + 
# Tolerances and sizing constants shared by the discretization utilities.
PMF_MASS_TOL = 10 * np.finfo(float).eps  # total-mass tolerance (10× machine epsilon)
RENORMALIZATION_THRESHOLD = 10 * np.finfo(float).eps
REALIZATION_MOMENT_TOL = 1e-12
SPACING_ATOL = 1e-12
SPACING_RTOL = 1e-6
MIN_GRID_SIZE = 100  # Minimum number of points in a discretization grid.
MAX_SAFE_EXP_ARG = math.log(np.finfo(np.float64).max)
TAIL_SWITCH = 1e-10


class Domain(Enum):
    """Domain of a discrete distribution's support."""

    REALS = "reals"  # p_min = mass at −∞, p_max = mass at +∞
    POSITIVES = "positives"  # p_min = mass at 0, p_max = mass at +∞


# =============================================================================
# ABSTRACT BASE
# =============================================================================


class DiscreteDistBase(ABC):
    """Abstract base for discrete PMF representations with boundary masses.

    Attributes:
        prob_arr: probability mass on finite support
        p_min: mass at the lower boundary (−∞ for REALS, 0 for POSITIVES)
        p_max: mass at +∞
        domain: whether the support is over the reals or positive numbers
    """

    def __init__(
        self,
        prob_arr: NDArray[np.float64],
        p_min: float = 0.0,
        p_max: float = 0.0,
        domain: Domain = Domain.REALS,
    ) -> None:
        """Initialize discrete distribution with PMF array and boundary masses."""
        self.prob_arr = np.asarray(prob_arr, dtype=np.float64)
        self.p_min = float(p_min)
        self.p_max = float(p_max)
        self.domain = domain
        # Subclasses run their own validation after this base check.
        self._validate_basic()

    @abstractmethod
    def get_x_array(self) -> NDArray[np.float64]:
        """Return materialized support points."""

    @property
    def x_array(self) -> NDArray[np.float64]:
        """Materialized support."""
        return self.get_x_array()

    def _validate_basic(self) -> None:
        """Check PMF validity and total-mass conservation within PMF_MASS_TOL."""
        validate_discrete_pmf_and_boundaries(
            self.prob_arr,
            self.p_min,
            self.p_max,
        )

        # fsum gives an exactly rounded sum, so the mass check is not polluted
        # by accumulation error in the PMF itself.
        pmf_sum = math.fsum(map(float, self.prob_arr))
        total_mass = pmf_sum + self.p_min + self.p_max
        mass_error = abs(total_mass - 1.0)
        if mass_error > PMF_MASS_TOL:
            error_msg = "MASS CONSERVATION ERROR"
            error_msg += f": Error={mass_error:.2e} (tolerance={PMF_MASS_TOL:.2e})"
            error_msg += f", PMF sum={pmf_sum:.15f}"
            error_msg += f", min={self.p_min:.2e}"
            error_msg += f", max={self.p_max:.2e}"
            error_msg += f", Total mass={total_mass:.15f}"
            raise ValueError(error_msg)

        # REALS domain: both boundaries being non-zero is not allowed.
        if self.domain == Domain.REALS and self.p_min > PMF_MASS_TOL and self.p_max > PMF_MASS_TOL:
            raise ValueError("REALS domain: p_min and p_max cannot both be non-zero")

    def truncate_edges(self, tail_truncation: float, bound_type: BoundType) -> Self:
        """Truncate distribution edges. Computation lives in distribution_utils."""
        new_prob_arr, new_p_min, new_p_max, min_ind, max_ind = compute_truncation(
            self.prob_arr, self.p_min, self.p_max, tail_truncation, bound_type
        )
        return self._create_truncated(new_prob_arr, new_p_min, new_p_max, min_ind, max_ind)

    @abstractmethod
    def _create_truncated(
        self,
        new_prob_arr: NDArray[np.float64],
        new_p_min: float,
        new_p_max: float,
        min_ind: int,
        max_ind: int,
    ) -> Self:
        """Create truncated instance preserving representation semantics."""

    @abstractmethod
    def copy(self) -> Self:
        """Deep-copy this distribution while preserving representation type."""
self._x_array = np.asarray(x_array, dtype=np.float64) + super().__init__(prob_arr, p_min, p_max, domain) + self._validate_x_array() + + def _validate_x_array(self) -> None: + if self._x_array.ndim != 1 or self._x_array.shape != self.prob_arr.shape: + raise ValueError("x and PMF must be 1-D arrays of equal length") + if not np.all(np.diff(self._x_array) > 0): + raise ValueError("x must be strictly increasing") + + def get_x_array(self) -> NDArray[np.float64]: + """Return materialized support points.""" + return self._x_array + + def _create_truncated( + self, + new_prob_arr: NDArray[np.float64], + new_p_min: float, + new_p_max: float, + min_ind: int, + max_ind: int, + ) -> SparseDiscreteDist: + return SparseDiscreteDist( + x_array=self._x_array[slice(min_ind, max_ind + 1)].copy(), + prob_arr=new_prob_arr, + p_min=new_p_min, + p_max=new_p_max, + domain=self.domain, + ) + + def copy(self) -> SparseDiscreteDist: + """Create a deep copy of this distribution.""" + return SparseDiscreteDist( + x_array=self._x_array.copy(), + prob_arr=self.prob_arr.copy(), + p_min=self.p_min, + p_max=self.p_max, + domain=self.domain, + ) + + +# ============================================================================= +# UNIFIED REGULAR-GRID DISTRIBUTION +# ============================================================================= + + +class DenseDiscreteDist(DiscreteDistBase): + """Discrete distribution on a regular (linear or geometric) grid. + + spacing_type = LINEAR: x[i] = x_min + i * step (step = additive gap > 0) + spacing_type = GEOMETRIC: x[i] = x_min * exp(i * step) (step = log-ratio > 0) + + For geometric grids the domain is always POSITIVES (x_min > 0 enforces positivity). 
+ """ + + def __init__( + self, + x_min: float, + step: float, + prob_arr: NDArray[np.float64], + p_min: float = 0.0, + p_max: float = 0.0, + spacing_type: SpacingType = SpacingType.LINEAR, + domain: Domain = Domain.REALS, + grid: RegularGrid | None = None, + ) -> None: + """Initialize regular-grid discrete distribution.""" + prob_arr = np.asarray(prob_arr, dtype=np.float64) + if grid is None: + grid = RegularGrid( + x_min=float(x_min), + step=float(step), + size=prob_arr.size, + spacing_type=spacing_type, + ) + elif grid.size != prob_arr.size: + raise ValueError( + f"Grid size must match PMF size, got grid.size={grid.size}, " + f"prob_arr.size={prob_arr.size}" + ) + self.grid = grid + super().__init__(prob_arr, p_min, p_max, domain) + self._validate_grid() + + @classmethod + def from_grid( + cls, + *, + grid: RegularGrid, + prob_arr: NDArray[np.float64], + p_min: float = 0.0, + p_max: float = 0.0, + domain: Domain = Domain.REALS, + ) -> "DenseDiscreteDist": + """Create DenseDiscreteDist from exact regular-grid metadata.""" + return cls( + x_min=grid.x_min, + step=grid.step, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + spacing_type=grid.spacing_type, + domain=domain, + grid=grid, + ) + + @property + def x_min(self) -> float: + return self.grid.x_min + + @property + def step(self) -> float: + return self.grid.step + + @property + def spacing_type(self) -> SpacingType: + return self.grid.spacing_type + + def _validate_grid(self) -> None: + if self.spacing_type == SpacingType.LINEAR: + if self.step <= 0.0: + raise ValueError("step must be positive for linear grid") + elif self.spacing_type == SpacingType.GEOMETRIC: + if self.x_min <= 0.0: + raise ValueError("x_min must be positive for geometric grid") + if self.step <= 0.0: + raise ValueError("step must be positive for geometric grid") + if self.domain != Domain.POSITIVES: + raise ValueError("Geometric spacing requires domain=Domain.POSITIVES") + else: + raise ValueError(f"Unknown SpacingType: 
{self.spacing_type}") + + @classmethod + def from_x_array( + cls, + x_array: NDArray[np.float64], + prob_arr: NDArray[np.float64], + p_min: float = 0.0, + p_max: float = 0.0, + spacing_type: SpacingType = SpacingType.LINEAR, + domain: Domain = Domain.REALS, + ) -> "DenseDiscreteDist": + """Create DenseDiscreteDist from x_array by extracting x_min and step.""" + if spacing_type == SpacingType.LINEAR: + step = compute_bin_width(x_array) + else: + step = compute_bin_log_ratio(x_array) + return cls( + x_min=float(x_array[0]), + step=step, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + spacing_type=spacing_type, + domain=domain, + ) + + def get_x_array(self) -> NDArray[np.float64]: + """Return materialized support points.""" + return self.grid.x_array + + def _create_truncated( + self, + new_prob_arr: NDArray[np.float64], + new_p_min: float, + new_p_max: float, + min_ind: int, + max_ind: int, + ) -> "DenseDiscreteDist": + if self.spacing_type == SpacingType.LINEAR: + new_grid = RegularGrid( + x_min=self.x_min + min_ind * self.step, + step=self.step, + size=new_prob_arr.size, + spacing_type=self.spacing_type, + ) + else: + new_grid = RegularGrid( + x_min=self.x_min * float(np.exp(self.step * min_ind)), + step=self.step, + size=new_prob_arr.size, + spacing_type=self.spacing_type, + ) + return self.__class__( + x_min=new_grid.x_min, + step=new_grid.step, + prob_arr=new_prob_arr, + p_min=new_p_min, + p_max=new_p_max, + spacing_type=self.spacing_type, + domain=self.domain, + grid=new_grid, + ) + + def copy(self) -> "DenseDiscreteDist": + """Create a deep copy of this distribution.""" + return self.__class__( + x_min=self.grid.x_min, + step=self.grid.step, + prob_arr=self.prob_arr.copy(), + p_min=self.p_min, + p_max=self.p_max, + spacing_type=self.spacing_type, + domain=self.domain, + grid=self.grid, + ) + + +# ============================================================================= +# PLD REALIZATION +# 
============================================================================= + + +class PLDRealization(DenseDiscreteDist): + """Linear-grid PLD realization in loss space.""" + + def __init__( + self, + x_min: float, + step: float, + prob_arr: NDArray[np.float64], + p_min: float = 0.0, + p_max: float = 0.0, + ) -> None: + """Initialize PLD realization with privacy loss values and probabilities.""" + super().__init__( + x_min=x_min, + step=step, + prob_arr=prob_arr, + p_min=float(p_min), + p_max=float(p_max), + spacing_type=SpacingType.LINEAR, + domain=Domain.REALS, + ) + self._validate_pld_realization() + + @classmethod + def from_linear_dist(cls, dist: DenseDiscreteDist) -> "PLDRealization": + """Build a validated PLD realization from a linear-grid DenseDiscreteDist.""" + if not isinstance(dist, DenseDiscreteDist) or dist.spacing_type != SpacingType.LINEAR: + raise TypeError( + f"from_linear_dist requires DenseDiscreteDist with LINEAR spacing, got {type(dist)}" + ) + return cls( + x_min=dist.x_min, + step=dist.step, + prob_arr=dist.prob_arr, + p_max=dist.p_max, + p_min=dist.p_min, + ) + + def _validate_pld_realization(self) -> None: + """Validate the properties of PLD-realization. + + 1. p(-inf) = 0 (p_min = 0). + 2. E[e^(-X)] <= 1. + """ + # PLD realizations must have zero mass at negative-infinity loss. 
+ if self.p_min > PMF_MASS_TOL: + raise ValueError(f"PLD realization requires p_min = 0, got {self.p_min:.2e}") + + exp_moment_val = exp_moment_terms(prob_arr=self.prob_arr, x_vals=self.x_array) + if np.any(np.isinf(exp_moment_val)): + raise ValueError( + "Exponential moment E[exp(-L)] is infinite, not a valid PLD realization" + ) + exp_moment_total = math.fsum(map(float, exp_moment_val)) + if exp_moment_total > 1.0 + REALIZATION_MOMENT_TOL: + raise ValueError( + f"Exponential moment E[exp(-L)] = {exp_moment_total:.15f} > 1.0, " + "not a valid PLD realization" + ) + + def copy(self) -> "PLDRealization": + """Create a deep copy of this PLD realization.""" + return PLDRealization( + x_min=self.x_min, + step=self.step, + prob_arr=self.prob_arr.copy(), + p_max=self.p_max, + p_min=self.p_min, + ) + + def truncate_edges( # type: ignore[override] + self, tail_truncation: float, bound_type: BoundType + ) -> DenseDiscreteDist: + if bound_type == BoundType.IS_DOMINATED: + # IS_DOMINATED can set p_min > 0, violating PLDRealization.p_min = 0. + # Delegate through a plain DenseDiscreteDist so the result is not a PLDRealization. 
+ return DenseDiscreteDist( + x_min=self.x_min, + step=self.step, + prob_arr=self.prob_arr.copy(), + p_min=self.p_min, + p_max=self.p_max, + ).truncate_edges(tail_truncation, bound_type) + return super().truncate_edges(tail_truncation, bound_type) + + def _create_truncated( + self, + new_prob_arr: NDArray[np.float64], + new_p_min: float, + new_p_max: float, + min_ind: int, + max_ind: int, + ) -> "PLDRealization": + """Create a truncated PLD realization while preserving linear-loss semantics.""" + del max_ind + return PLDRealization( + x_min=self.x_min + min_ind * self.step, + step=self.step, + prob_arr=new_prob_arr, + p_min=new_p_min, + p_max=new_p_max, + ) + +# ============================================================================= +# Public Utility Functions +# ============================================================================= + + +def enforce_mass_conservation( + *, + prob_arr: NDArray[np.float64], + expected_p_min: float, + expected_p_max: float, + bound_type: BoundType, +) -> tuple[NDArray[np.float64], float, float]: + """Enforce total mass with one bound-type-selected boundary held fixed. + + - ``DOMINATES`` enforces ``expected_p_max``. + - ``IS_DOMINATED`` enforces ``expected_p_min``. + + Excess mass is removed directionally over an extended array that includes the + opposite boundary, matching the truncation logic: + - ``DOMINATES`` trims from the left over ``[p_min, *prob_arr]``. + - ``IS_DOMINATED`` trims from the right over ``[*prob_arr, p_max]``. + + Any remaining slack is assigned to the enforced boundary. 
+ """ + prob_arr = np.asarray(prob_arr, dtype=np.float64).copy() + validate_discrete_pmf_and_boundaries( + prob_arr, + expected_p_min, + expected_p_max, + ) + + total_mass = math.fsum(map(float, prob_arr)) + expected_p_min + expected_p_max + if total_mass <= 0.0: + raise ValueError("Cannot enforce mass conservation with zero total mass") + + if bound_type == BoundType.DOMINATES: + if expected_p_max > 1.0: + raise ValueError("Expected p_max cannot exceed 1") + extended = np.concatenate(([expected_p_min], prob_arr)) + target_mass = 1.0 - expected_p_max + current_mass = math.fsum(map(float, extended)) + excess = current_mass - target_mass + if excess > 0: + if excess < RENORMALIZATION_THRESHOLD: + # Tiny excess (numerical noise): renormalize instead of trimming bins + extended = extended * (target_mass / current_mass) + else: + extended = _zero_mass(values=extended, mass=excess, from_left=True, exact=True) + current_mass = math.fsum(map(float, extended)) + return ( + extended[1:].copy(), + float(extended[0]), + expected_p_max + max(0.0, target_mass - current_mass), + ) + + if bound_type == BoundType.IS_DOMINATED: + if expected_p_min > 1.0: + raise ValueError("Expected p_min cannot exceed 1") + extended = np.concatenate((prob_arr, [expected_p_max])) + target_mass = 1.0 - expected_p_min + current_mass = math.fsum(map(float, extended)) + excess = current_mass - target_mass + if excess > 0: + if excess < RENORMALIZATION_THRESHOLD: + # Tiny excess (numerical noise): renormalize instead of trimming bins + extended = extended * (target_mass / current_mass) + else: + extended = _zero_mass(values=extended, mass=excess, from_left=False, exact=True) + current_mass = math.fsum(map(float, extended)) + return ( + extended[:-1].copy(), + expected_p_min + max(0.0, target_mass - current_mass), + float(extended[-1]), + ) + + raise ValueError( + f"Invalid bound_type: {bound_type}. Must be BoundType.DOMINATES or BoundType.IS_DOMINATED." 
def compute_bin_width_two_arrays(
    *, x_array_1: NDArray[np.float64], x_array_2: NDArray[np.float64]
) -> float:
    """Compute linear spacing width for two grids and return their average."""
    w1 = compute_bin_width(x_array_1)
    w2 = compute_bin_width(x_array_2)
    if stable_isclose(a=w1, b=w2):
        return 0.5 * (w1 + w2)
    raise ValueError(f"Grid spacing must match: w1={w1:.12g} vs w2={w2:.12g}")


# =============================================================================
# Grid Spacing Utilities
# =============================================================================


def compute_bin_log_ratio(x_array: NDArray[np.float64]) -> float:
    """Compute geometric log-ratio spacing for a grid."""
    if x_array.size < 2:
        raise ValueError("Cannot compute geometric bin ratio with less than 2 bins")
    if np.any(x_array <= 0):
        raise ValueError("Cannot compute geometric bin ratio for non-positive values")
    log_vals = np.log(x_array)
    # Endpoint-based estimate of the per-bin log spacing.
    log_ratio = float((log_vals[-1] - log_vals[0]) / (x_array.size - 1))
    gaps = np.diff(log_vals)
    uniform = np.allclose(log_ratio, gaps, rtol=SPACING_RTOL, atol=SPACING_ATOL)
    if not uniform:
        max_diff = np.max(np.abs(log_ratio - gaps))
        raise ValueError(
            "Distribution has non-uniform bin widths: "
            f"log_ratio={log_ratio}, max_diff={max_diff}"
        )
    return log_ratio


def compute_bin_width(x_array: NDArray[np.float64]) -> float:
    """Compute linear spacing width for a grid."""
    if x_array.size < 2:
        raise ValueError("Cannot compute width with less than 2 bins")
    gaps = np.diff(x_array)
    # The median is robust to a single slightly-off gap when deciding uniformity.
    median_diff = np.median(gaps)
    uniform = np.allclose(median_diff, gaps, rtol=SPACING_RTOL, atol=SPACING_ATOL)
    if not uniform:
        max_diff = np.max(np.abs(median_diff - gaps))
        raise ValueError(
            "Distribution has non-uniform bin widths: "
            f"median_diff={median_diff}, max diff={max_diff}"
        )
    return float(median_diff)


def stable_isclose(*, a: float, b: float) -> bool:
    """Consistent closeness check using shared spacing tolerances."""
    return np.isclose(a, b, rtol=SPACING_RTOL, atol=SPACING_ATOL).item()
def stable_array_equal(*, a: NDArray[np.float64], b: NDArray[np.float64]) -> bool:
    """Consistent array closeness check using shared spacing tolerances."""
    return a.shape == b.shape and np.allclose(a, b, rtol=SPACING_RTOL, atol=SPACING_ATOL)


def exp_moment_terms(
    *,
    prob_arr: NDArray[np.float64],
    x_vals: NDArray[np.float64],
) -> NDArray[np.float64]:
    """Return per-bin contributions to ``E[exp(-X)]``.

    For very negative ``x_vals`` the naive product ``p * exp(-x)`` can overflow
    even when the combined term is representable. In that regime we evaluate the
    contribution as ``exp(log(p) - x)`` instead.

    Terms that still exceed float64 range are returned as ``inf``.
    """
    prob_arr = np.asarray(prob_arr, dtype=np.float64)
    x_vals = np.asarray(x_vals, dtype=np.float64)
    if prob_arr.shape != x_vals.shape:
        raise ValueError("prob_arr and x_vals must have the same shape")

    terms = np.zeros_like(prob_arr, dtype=np.float64)
    positive_mask = prob_arr > 0.0
    # Safe regime: p * exp(-x) cannot overflow float64.
    safe_mask = positive_mask & (x_vals >= -MAX_SAFE_EXP_ARG)
    if np.any(safe_mask):
        terms[safe_mask] = prob_arr[safe_mask] * np.exp(-x_vals[safe_mask])

    # Extreme regime: evaluate in log space; saturate to inf past float64 range.
    extreme_mask = positive_mask & (x_vals < -MAX_SAFE_EXP_ARG)
    if np.any(extreme_mask):
        log_terms = np.log(prob_arr[extreme_mask]) - x_vals[extreme_mask]
        terms_extreme = np.exp(np.minimum(log_terms, MAX_SAFE_EXP_ARG))
        terms_extreme[log_terms > MAX_SAFE_EXP_ARG] = np.inf
        terms[extreme_mask] = terms_extreme

    return terms


# =============================================================================
# Distribution Edge Truncation
# =============================================================================


def compute_truncation(
    prob_arr: NDArray[np.float64],
    p_min: float,
    p_max: float,
    tail_truncation: float,
    bound_type: BoundType,
) -> tuple[NDArray[np.float64], float, float, int, int]:
    """Compute truncated distribution parameters without creating objects.

    Algorithm:
      A. Remove leading/trailing zeros from PMF (always done).
      B. If tail_truncation > 0:
         - Compute how much to consume from each side (up to tail_truncation).
         - For DOMINATES: Operate over the [p_min, *prob_arr] range.
           Left tail folds into first remaining element;
           right tail goes to p_max.
         - For IS_DOMINATED: Operate over the [*prob_arr, p_max] range.
           Right tail folds into last remaining element;
           left tail goes to p_min.
      C. Apply step A again to remove any newly created leading/trailing zeros.

    Returns:
      (new_prob_arr, new_p_min, new_p_max, min_ind, max_ind) where min_ind and
      max_ind are indices into the original prob_arr.
    """
    # Remove zero probability tails to reduce unnecessary computations
    inner_min, inner_max = _strip_zero_edges(prob_arr)
    trimmed_prob_arr = prob_arr[slice(inner_min, inner_max + 1)].copy()

    if tail_truncation == 0.0:
        return trimmed_prob_arr.copy(), p_min, p_max, inner_min, inner_max

    if bound_type == BoundType.DOMINATES:
        extended_prob = np.concatenate([[p_min], trimmed_prob_arr])
        original_mass = math.fsum(map(float, extended_prob))
        # Truncate left tail and add its mass to the next finite bin
        extended_prob = _zero_mass(
            values=extended_prob, mass=tail_truncation, from_left=True, exact=False
        )
        shifted_mass = original_mass - math.fsum(map(float, extended_prob))
        extended_prob[np.nonzero(extended_prob)[0][0]] += shifted_mass
        p_min_out = extended_prob[0]
        # Truncate right tail and add its mass to p_max
        extended_prob = _zero_mass(
            values=extended_prob, mass=tail_truncation, from_left=False, exact=False
        )
        shifted_mass = original_mass - math.fsum(map(float, extended_prob))
        p_max_out = p_max + shifted_mass
        prob_arr_out = extended_prob[1:]
    elif bound_type == BoundType.IS_DOMINATED:
        extended_prob = np.concatenate((trimmed_prob_arr, [p_max]))
        original_mass = math.fsum(map(float, extended_prob))
        # Truncate right tail and add its mass to the next finite bin
        extended_prob = _zero_mass(
            values=extended_prob, mass=tail_truncation, from_left=False, exact=False
        )
        shifted_mass = original_mass - math.fsum(map(float, extended_prob))
        extended_prob[np.nonzero(extended_prob)[0][-1]] += shifted_mass
        p_max_out = extended_prob[-1]
        # Truncate left tail and add its mass to p_min
        extended_prob = _zero_mass(
            values=extended_prob, mass=tail_truncation, from_left=True, exact=False
        )
        shifted_mass = original_mass - math.fsum(map(float, extended_prob))
        p_min_out = p_min + shifted_mass
        prob_arr_out = extended_prob[:-1]
    else:
        raise ValueError(f"Unknown BoundType: {bound_type}")

    # Remove zero probability tails to reduce unnecessary computations
    inner_min_new, inner_max_new = _strip_zero_edges(prob_arr_out)
    min_ind_new = inner_min + inner_min_new
    max_ind_new = inner_min + inner_max_new
    return (
        prob_arr_out[slice(inner_min_new, inner_max_new + 1)].copy(),
        p_min_out,
        p_max_out,
        min_ind_new,
        max_ind_new,
    )


def _strip_zero_edges(prob_arr: NDArray[np.float64]) -> tuple[int, int]:
    """Return (min_ind, max_ind) of the nonzero range in prob_arr.

    Raises ValueError if all mass is zero.
    """
    nonzero_indices = np.nonzero(prob_arr)[0]
    if nonzero_indices.size == 0:
        raise ValueError("Cannot truncate distribution with zero finite mass")
    return int(nonzero_indices[0]), int(nonzero_indices[-1])
+ """ + if mass <= 0.0: + return values + total_mass = math.fsum(map(float, values)) + if mass >= total_mass: + raise ValueError( + "mass must be smaller than total array mass, " + f"got mass={mass:.12g}, total={total_mass:.12g}" + ) + + # When removing from the right, we just flip the array before and after the calculation + if not from_left: + values = values[::-1] + + # Find the pivot index + cumsum = np.cumsum(values, dtype=np.float64) + if exact: + pivot = int(np.searchsorted(cumsum, mass, side="left")) + else: + pivot = int(np.searchsorted(cumsum, mass, side="right")) + + # Remove the probability mass below the pivot + removed_before = float(cumsum[pivot - 1]) if pivot > 0 else 0.0 + if pivot > 0: + values[:pivot] = 0.0 + # Remove the additional probability mass from the pivot if needed + if exact: + values[pivot] = max(0.0, values[pivot] - (mass - removed_before)) + + if not from_left: + values = values[::-1] + return values + + +# ============================================================================= +# Public API: Continuous Distribution Discretization +# ============================================================================= + + +def discretize_continuous_distribution( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + tail_truncation: float, + bound_type: BoundType, + spacing_type: SpacingType, + step: float, + align_to_multiples: bool, + domain: Domain = Domain.REALS, +) -> DenseDiscreteDist: + """Discretize a continuous distribution to a typed structured representation. + + Args: + dist: Continuous distribution to discretize. + tail_truncation: Tail mass budget used to define quantile bounds and bin increment floor. + bound_type: Tie-breaking direction for interval mass assignment. + spacing_type: Output grid spacing family (linear or geometric). + step: Linear bin width or geometric log-ratio. + align_to_multiples: Whether to align the quantile-derived bounds to integer step multiples. 
+ domain: Domain semantics for boundary masses in the output discrete distribution. + + Returns: + Discretized distribution on a structured dense grid. + """ + + grid = _discretize_continuous_to_grid( + dist=dist, + tail_truncation=tail_truncation, + spacing_type=spacing_type, + step=step, + align_to_multiples=align_to_multiples, + ) + x_array = grid.x_array + if x_array[0] <= 0 and domain == Domain.POSITIVES: + dist_name = getattr(dist, "name", type(dist).__name__) + raise ValueError( + f"Cannot discretize {dist_name} to a positive range, got x_min={x_array[0]}" + ) + + # 2. Map density to PMF with semantics. + return discretize_continuous_grid( + dist=dist, + grid=grid, + bound_type=bound_type, + PMF_min_increment=tail_truncation, + domain=domain, + ) + + +def discretize_continuous_dist( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + x_array: NDArray[np.float64], + bound_type: BoundType, + PMF_min_increment: float, + spacing_type: SpacingType, + domain: Domain = Domain.REALS, +) -> DenseDiscreteDist: + """Convert continuous distribution to discrete PMF with bounding semantics.""" + prob_arr, p_min, p_max = _discretize_continuous_prob_arr( + dist=dist, + x_array=x_array, + bound_type=bound_type, + PMF_min_increment=PMF_min_increment, + ) + + if spacing_type == SpacingType.LINEAR: + return DenseDiscreteDist.from_x_array( + x_array=x_array, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + domain=domain, + ) + + if spacing_type == SpacingType.GEOMETRIC: + return DenseDiscreteDist.from_x_array( + x_array=x_array, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + raise ValueError(f"Invalid spacing_type: {spacing_type}") + + +def discretize_continuous_grid( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + grid: RegularGrid, + bound_type: BoundType, + PMF_min_increment: float, + domain: Domain = Domain.REALS, +) -> DenseDiscreteDist: + """Convert continuous distribution to a 
discrete PMF on a known regular grid.""" + prob_arr, p_min, p_max = _discretize_continuous_prob_arr( + dist=dist, + x_array=grid.x_array, + bound_type=bound_type, + PMF_min_increment=PMF_min_increment, + ) + if grid.spacing_type == SpacingType.LINEAR: + return DenseDiscreteDist.from_grid( + grid=grid, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + domain=domain, + ) + if grid.spacing_type == SpacingType.GEOMETRIC: + return DenseDiscreteDist.from_grid( + grid=grid, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + domain=Domain.POSITIVES, + ) + raise ValueError(f"Invalid spacing_type: {grid.spacing_type}") + + +def _discretize_continuous_prob_arr( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + x_array: NDArray[np.float64], + bound_type: BoundType, + PMF_min_increment: float, +) -> tuple[NDArray[np.float64], float, float]: + """Compute discrete PMF and boundary masses on a materialized grid.""" + bin_probs, p_left, p_right = _compute_discrete_prob( + dist=dist, x_array=x_array, bound_type=bound_type, PMF_min_increment=PMF_min_increment + ) + + n = x_array.size + prob_arr = np.zeros(n) + + if bound_type == BoundType.DOMINATES: + # Shift mass right: left tail (-inf, x0) -> x0, + # each interval [x_i, x_{i+1}) -> x_{i+1}, right tail (x_n, inf) -> inf. + prob_arr[0] = p_left + prob_arr[1:] = bin_probs + return prob_arr, 0.0, p_right + + if bound_type == BoundType.IS_DOMINATED: + # Shift mass left: left tail (-inf, x0) -> -inf, + # each interval [x_i, x_{i+1}) -> x_i, right tail (x_n, inf) -> x_n. + prob_arr[:-1] = bin_probs + prob_arr[-1] = p_right + return prob_arr, p_left, 0.0 + + raise ValueError(f"Unknown BoundType: {bound_type}") + + +def rediscretize_dist( + *, + dist: DiscreteDistBase, + tail_truncation: float, + loss_discretization: float, + spacing_type: SpacingType, + bound_type: BoundType, +) -> DenseDiscreteDist: + """Rediscretize a distribution onto a requested grid spacing. 
+
+    Remaps PMF onto a new grid with the requested spacing and discretization.
+    Implementation trims zero/tail regions, computes new grid size, then remaps
+    using domination-aware rounding (e.g., linear grids for dp_accounting output).
+
+    Algorithm 6 (`disc-dist`) in Appendix C.
+    """
+
+    # Support for rediscretizing a dominating distribution into a dominated one and vice versa
+    working_dist = dist.copy()
+    if bound_type == BoundType.IS_DOMINATED and working_dist.p_max > 0.0:
+        working_dist.prob_arr[-1] += working_dist.p_max
+        working_dist.p_max = 0.0
+    elif (
+        bound_type == BoundType.DOMINATES
+        and spacing_type == SpacingType.LINEAR
+        and working_dist.p_min > 0.0
+    ):
+        working_dist.prob_arr[0] += working_dist.p_min
+        working_dist.p_min = 0.0
+
+    # Quantile-truncation
+    trunc_dist = working_dist.truncate_edges(
+        tail_truncation=tail_truncation / 2, bound_type=bound_type
+    )
+
+    x_array = trunc_dist.x_array
+    x_min = x_array[0]
+    x_max = x_array[-1]
+
+    grid_out = discretize_aligned_grid(
+        x_min=x_min,
+        x_max=x_max,
+        spacing_type=spacing_type,
+        align_to_multiples=True,
+        discretization=loss_discretization,
+    )
+    x_array_out = grid_out.x_array
+
+    prob_arr_out = rediscretize_prob(
+        x_array=x_array,
+        prob_arr=trunc_dist.prob_arr,
+        x_array_out=x_array_out,
+        dominates=(bound_type == BoundType.DOMINATES),
+    )
+
+    prob_arr_out, p_min, p_max = enforce_mass_conservation(
+        prob_arr=prob_arr_out,
+        expected_p_min=working_dist.p_min,
+        expected_p_max=working_dist.p_max,
+        bound_type=bound_type,
+    )
+
+    if spacing_type == SpacingType.LINEAR:
+        return DenseDiscreteDist.from_grid(
+            grid=grid_out,
+            prob_arr=prob_arr_out,
+            p_min=p_min,
+            p_max=p_max,
+        )
+
+    if spacing_type == SpacingType.GEOMETRIC:
+        return DenseDiscreteDist.from_grid(
+            grid=grid_out,
+            prob_arr=prob_arr_out,
+            p_min=p_min,
+            p_max=p_max,
+            domain=Domain.POSITIVES,
+        )
+
+    raise ValueError(f"Invalid spacing_type: {spacing_type}")
+
+
+def discretize_aligned_grid(
+    *,
+    x_min: float,
+    x_max: 
float, + spacing_type: SpacingType, + align_to_multiples: bool, + discretization: float, +) -> RegularGrid: + """Return regular grid metadata covering [x_min, x_max]. + + Args: + x_min: Minimum value of the range. + x_max: Maximum value of the range. + spacing_type: Type of spacing (LINEAR or GEOMETRIC). + align_to_multiples: If True, align range to whole multiples of discretization. + If False, use x_min and x_max directly without alignment. + discretization: Grid spacing parameter (step size for LINEAR, log ratio for GEOMETRIC). + + """ + # Validate inputs + if spacing_type not in (SpacingType.GEOMETRIC, SpacingType.LINEAR): + raise ValueError(f"Unsupported spacing_type: {spacing_type}") + if x_max <= x_min: + raise ValueError(f"x_max must be greater than x_min, got x_min={x_min}, x_max={x_max}") + if spacing_type == SpacingType.GEOMETRIC and x_min <= 0: + raise ValueError( + f"Geometric spacing requires positive values, got x_min={x_min}, x_max={x_max}" + ) + + if discretization <= 0: + raise ValueError("discretization must be positive") + + d = float(discretization) + + if spacing_type == SpacingType.LINEAR: + if align_to_multiples: + k_lo = int(np.floor(x_min / d)) + k_hi = int(np.ceil(x_max / d)) + grid = RegularGrid( + x_min=float(d * k_lo), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.LINEAR, + ) + # It is possible that aligned bounds miss by a small float64 roundoff. 
+ if grid.x_array[0] > x_min: + k_lo -= 1 + grid = RegularGrid( + x_min=float(d * k_lo), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.LINEAR, + ) + if grid.x_array[-1] < x_max: + k_hi += 1 + return RegularGrid( + x_min=float(d * k_lo), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.LINEAR, + ) + span = x_max - x_min + n = int(np.ceil(span / d)) + 1 + grid = RegularGrid( + x_min=float(x_min), + step=d, + size=n, + spacing_type=SpacingType.LINEAR, + ) + if grid.x_array[-1] < x_max: + n += 1 + return RegularGrid( + x_min=float(x_min), + step=d, + size=n, + spacing_type=SpacingType.LINEAR, + ) + + # GEOMETRIC: discretization is log-ratio per step; grid x = exp(d * k). + if align_to_multiples: + k_lo = int(np.floor(np.log(x_min) / d)) + k_hi = int(np.ceil(np.log(x_max) / d)) + grid = RegularGrid( + x_min=float(np.exp(d * k_lo)), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.GEOMETRIC, + ) + # It is possible that aligned bounds miss by a small float64 roundoff. 
+ if grid.x_array[0] > x_min: + k_lo -= 1 + grid = RegularGrid( + x_min=float(np.exp(d * k_lo)), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.GEOMETRIC, + ) + if grid.x_array[-1] < x_max: + k_hi += 1 + return RegularGrid( + x_min=float(np.exp(d * k_lo)), + step=d, + size=k_hi - k_lo + 1, + spacing_type=SpacingType.GEOMETRIC, + ) + n = int(np.ceil(np.log(x_max / x_min) / d)) + 1 + grid = RegularGrid( + x_min=float(x_min), + step=d, + size=n, + spacing_type=SpacingType.GEOMETRIC, + ) + if grid.x_array[-1] < x_max: + n += 1 + return RegularGrid( + x_min=float(x_min), + step=d, + size=n, + spacing_type=SpacingType.GEOMETRIC, + ) + + +def discretize_aligned_range( + *, + x_min: float, + x_max: float, + spacing_type: SpacingType, + align_to_multiples: bool, + discretization: float, +) -> NDArray[np.float64]: + """Return a grid covering [x_min, x_max].""" + return discretize_aligned_grid( + x_min=x_min, + x_max=x_max, + spacing_type=spacing_type, + align_to_multiples=align_to_multiples, + discretization=discretization, + ).x_array + + +@njit(cache=True) +def rediscretize_prob( + x_array: NDArray[np.float64], + prob_arr: NDArray[np.float64], + x_array_out: NDArray[np.float64], + dominates: bool, +) -> NDArray[np.float64]: + """Remap PMF onto new grid with domination-aware rounding. + + Maps each probability mass to output grid position based on domination semantics. + Implementation: dominates=True uses ceil (pessimistic), False uses floor (optimistic). + Uses Kahan summation for numerical accuracy. + Returns: remapped PMF array. 
+ + """ + n_out = x_array_out.size + prob_arr_out = np.zeros(n_out) + compensations = np.zeros(n_out) + + # single pointer into x_array_out since x_array is strictly increasing + j = 0 + + if dominates: + # ceil: bin = first index with x_array_out[j] >= z; overflow right -> p_max + for i in range(x_array.size): + z = x_array[i] + mass = prob_arr[i] + # Skip only zero-mass bins, not small-mass bins + if mass <= 0: + continue + + # advance while x_array_out[j] < z + while j < n_out and x_array_out[j] < z: + j += 1 + + if j >= n_out: + # overflow to the right: discard mass (goes to p_max via enforce_mass_conservation) + continue + # include values below x_array_out[0] in the first bin (ceil behavior) + y = mass - compensations[j] + t = prob_arr_out[j] + y + compensations[j] = (t - prob_arr_out[j]) - y + prob_arr_out[j] = t + + else: + # floor: bin = last index with x_array_out[j] <= z; underflow left -> p_min + for i in range(x_array.size): + z = x_array[i] + mass = prob_arr[i] + # Skip only zero-mass bins, not small-mass bins + if mass <= 0: + continue + + # advance while x_array_out[j] <= z + while j < n_out and x_array_out[j] <= z: + j += 1 + + idx = j - 1 + if idx < 0: + # underflow to the left: discard mass (goes to p_min via enforce_mass_conservation) + continue + # include values above x_array_out[-1] in the last bin (floor behavior) + y = mass - compensations[idx] + t = prob_arr_out[idx] + y + compensations[idx] = (t - prob_arr_out[idx]) - y + prob_arr_out[idx] = t + + return prob_arr_out + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +@njit(cache=True) +def _adaptive_bins_from_cdf( + *, + cdf: NDArray[np.float64], + tail_truncation: float, +) -> NDArray[np.float64]: + """Adaptive binning from CDF with mass accumulation. 
+ + Accumulates mass from CDF increments until threshold is reached, then assigns + accumulated mass to current bin. All mass is conserved - no mass is discarded. + """ + n = cdf.size + bin_probs = np.zeros(n - 1, dtype=np.float64) + accumulated_mass = 0.0 + + for i in range(n - 1): + # Current increment in CDF + current_increment = cdf[i + 1] - cdf[i] + accumulated_mass += current_increment + + if accumulated_mass >= tail_truncation: + # Assign accumulated mass to this bin + bin_probs[i] = accumulated_mass + accumulated_mass = 0.0 + + # Assign any remaining accumulated mass to the last bin + if accumulated_mass > 0.0: + bin_probs[n - 2] += accumulated_mass + + return bin_probs + + +@njit(cache=True) +def _adaptive_bins_from_sf( + *, + sf: NDArray[np.float64], + tail_truncation: float, +) -> NDArray[np.float64]: + """Adaptive binning from survival function with mass accumulation. + + Accumulates mass from SF increments until threshold is reached, then assigns + accumulated mass to current bin. All mass is conserved - no mass is discarded. + Processes from right to left (high to low x values). 
+ """ + n = sf.size + bin_probs = np.zeros(n - 1, dtype=np.float64) + accumulated_mass = 0.0 + + for i in range(n - 2, -1, -1): + # Current increment in SF (going backwards) + current_increment = sf[i] - sf[i + 1] + accumulated_mass += current_increment + + if accumulated_mass >= tail_truncation: + # Assign accumulated mass to this bin + bin_probs[i] = accumulated_mass + accumulated_mass = 0.0 + + # Assign any remaining accumulated mass to the first bin + if accumulated_mass > 0.0: + bin_probs[0] += accumulated_mass + + return bin_probs + + +def _stable_cdf_and_sf( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + x_array: NDArray[np.float64], +) -> tuple[NDArray[np.float64], NDArray[np.float64]]: + median = dist.median() + cdf = np.empty_like(x_array, dtype=np.float64) + sf = np.empty_like(x_array, dtype=np.float64) + + mask_left = x_array < median + if np.any(mask_left): + logcdf_vals = dist.logcdf(x_array[mask_left]) + cdf[mask_left] = np.exp(logcdf_vals) + sf[mask_left] = -np.expm1(logcdf_vals) + + mask_right = ~mask_left + if np.any(mask_right): + logsf_vals = dist.logsf(x_array[mask_right]) + sf[mask_right] = np.exp(logsf_vals) + cdf[mask_right] = -np.expm1(logsf_vals) + + cdf = np.clip(cdf, 0.0, 1.0) + sf = np.clip(sf, 0.0, 1.0) + return cdf, sf + + +def _compute_discrete_prob( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + x_array: NDArray[np.float64], + bound_type: BoundType, + PMF_min_increment: float, +) -> tuple[NDArray[np.float64], float, float]: + """Compute bin probabilities using adaptive CDF/SF increments with logcdf/logsf stability. + + PMF_min_increment controls the minimum CDF/SF increment that becomes a bin mass. + + """ + cdf, sf = _stable_cdf_and_sf( + dist=dist, + x_array=x_array, + ) + p_left = cdf[0] + p_right = sf[-1] + PMF_min_increment = max(0.0, PMF_min_increment) + + if bound_type == BoundType.DOMINATES: + # Suppress intermediate debug logging. 
+ bin_probs = _adaptive_bins_from_cdf( + cdf=cdf, + tail_truncation=PMF_min_increment, + ) + elif bound_type == BoundType.IS_DOMINATED: + # Suppress intermediate debug logging. + bin_probs = _adaptive_bins_from_sf( + sf=sf, + tail_truncation=PMF_min_increment, + ) + else: + raise ValueError(f"Unknown BoundType: {bound_type}") + + return bin_probs, p_left, p_right + + +def _discretize_continuous_to_grid( + *, + dist: stats.rv_continuous | rv_frozen[Any, Any], + tail_truncation: float, + spacing_type: SpacingType, + step: float, + align_to_multiples: bool, +) -> RegularGrid: + """Generate grid covering the quantile range defined by tail_truncation.""" + # Determine support bounds via quantiles + x_min = float(dist.ppf(tail_truncation)) + x_max = float(dist.isf(tail_truncation)) + if not np.isfinite(x_min) or not np.isfinite(x_max): + raise ValueError(f"Quantiles not finite: x_min={x_min}, x_max={x_max}") + + if spacing_type == SpacingType.GEOMETRIC: + if step <= 0.0: + raise ValueError(f"Geometric step must be positive, got {step}") + discretization = float(step) + else: + if step <= 0.0: + raise ValueError(f"Linear step must be positive, got {step}") + discretization = float(step) + + return discretize_aligned_grid( + x_min=x_min, + x_max=x_max, + spacing_type=spacing_type, + align_to_multiples=align_to_multiples, + discretization=discretization, + ) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions_test.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions_test.py new file mode 100644 index 00000000..8d33c3d5 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_distributions_test.py @@ -0,0 +1,562 @@ +""" +Core unit tests for random-allocation internals. + +Includes distribution discretization coverage and external interface conversion checks. 
+""" + +from __future__ import annotations + +import math + +import numpy as np +import pytest +from .random_allocation_distributions import ( + DenseDiscreteDist, + Domain, + PLDRealization, + _compute_discrete_prob as compute_discrete_PMF, + _zero_mass, + compute_bin_log_ratio, + compute_bin_width, + compute_truncation, + discretize_aligned_grid, + discretize_aligned_range, + discretize_continuous_distribution, + enforce_mass_conservation, + rediscretize_dist, + rediscretize_prob as pmf_remap_to_grid_kernel, +) +from .random_allocation_core import linear_dist_to_dp_accounting_pmf +from .random_allocation_types import BoundType, RegularGrid, SpacingType +from .random_allocation_utils import exp_linear_to_geometric, log_geometric_to_linear +from scipy import stats + + +def _make_realization() -> PLDRealization: + return PLDRealization( + x_min=-0.5, + step=0.5, + prob_arr=np.array([0.2, 0.3, 0.25, 0.15], dtype=np.float64), + p_max=0.1, + ) + + +def test_linear_dist_to_dp_accounting_preserves_shape_and_infinity_mass(): + original = _make_realization() + pmf = linear_dist_to_dp_accounting_pmf(dist=original, pessimistic_estimate=True) + + assert pmf._probs.shape == original.prob_arr.shape + assert pmf._infinity_mass == original.p_max + assert pmf._discretization == original.step + + +def test_linear_dist_to_dp_accounting_handles_zero_finite_mass(): + realization = DenseDiscreteDist( + x_min=0.0, + step=1.0, + prob_arr=np.array([0.0, 0.0], dtype=np.float64), + p_max=1.0, + ) + pmf = linear_dist_to_dp_accounting_pmf(dist=realization, pessimistic_estimate=True) + assert pmf._infinity_mass == 1.0 + assert np.allclose(pmf._probs, np.array([0.0, 0.0])) + + +def test_exp_log_round_trip_preserves_linear_step(): + original = DenseDiscreteDist( + x_min=0.0, + step=1e-4, + prob_arr=np.array([0.4, 0.6], dtype=np.float64), + ) + + round_tripped = log_geometric_to_linear(exp_linear_to_geometric(original)) + + assert round_tripped.step == original.step + + +def 
test_dense_dist_keeps_regular_grid_as_source_of_truth(): + grid = RegularGrid( + x_min=1.0, + step=1e-4, + size=2, + spacing_type=SpacingType.GEOMETRIC, + ) + dist = DenseDiscreteDist.from_grid( + grid=grid, + prob_arr=np.array([0.4, 0.6], dtype=np.float64), + domain=Domain.POSITIVES, + ) + + assert dist.grid == grid + assert dist.step == 1e-4 + + +class TestDiscritizeRange: + """Test discretize_aligned_range function.""" + + def test_linear_spacing(self): + n_grid = 100 + x = discretize_aligned_range( + x_min=0.0, + x_max=10.0, + spacing_type=SpacingType.LINEAR, + align_to_multiples=True, + discretization=(10.0 - 0.0) / (n_grid - 1), + ) + assert len(x) >= n_grid + assert x[0] <= 0.0 + assert x[-1] >= 10.0 + diffs = np.diff(x) + assert np.allclose(diffs, diffs[0]) + + def test_geometric_spacing(self): + n_grid = 100 + x = discretize_aligned_range( + x_min=1.0, + x_max=100.0, + spacing_type=SpacingType.GEOMETRIC, + align_to_multiples=True, + discretization=np.log(100.0 / 1.0) / (n_grid - 1), + ) + assert len(x) >= n_grid + assert x[0] <= 1.0 + assert x[-1] >= 100.0 + ratios = x[1:] / x[:-1] + assert np.allclose(ratios, ratios[0]) + + def test_nonpositive_discretization_rejected(self): + with pytest.raises(ValueError, match="discretization must be positive"): + discretize_aligned_range( + x_min=0.0, + x_max=10.0, + spacing_type=SpacingType.LINEAR, + align_to_multiples=True, + discretization=0.0, + ) + + def test_two_points_linear(self): + n_grid = 100 + x = discretize_aligned_range( + x_min=1.0, + x_max=3.0, + spacing_type=SpacingType.LINEAR, + align_to_multiples=True, + discretization=(3.0 - 1.0) / (n_grid - 1), + ) + assert len(x) >= n_grid + assert x[0] <= 1.0 + assert x[-1] >= 3.0 + diffs = np.diff(x) + assert np.allclose(diffs, diffs[0]) + + def test_linear_spacing_covers_endpoint_after_alignment_rounding(self): + x_min = -1.411426541779732 + x_max = 1.4160541697856062 + discretization = 0.041648652052517825 + + x = discretize_aligned_range( + x_min=x_min, + 
x_max=x_max, + spacing_type=SpacingType.LINEAR, + align_to_multiples=True, + discretization=discretization, + ) + + assert x[0] <= x_min + assert x[-1] >= x_max + diffs = np.diff(x) + assert np.allclose(diffs, diffs[0]) + + def test_linear_aligned_spacing_matches_requested_step(self): + discretization = 0.25 + x = discretize_aligned_range( + x_min=-1.12, + x_max=2.18, + spacing_type=SpacingType.LINEAR, + align_to_multiples=True, + discretization=discretization, + ) + + assert np.isclose(compute_bin_width(x), discretization) + assert np.allclose(x / discretization, np.round(x / discretization)) + + def test_continuous_discretization_uses_requested_linear_step(self): + result = discretize_continuous_distribution( + dist=stats.norm(loc=0.0, scale=1.0), + tail_truncation=1e-3, + bound_type=BoundType.DOMINATES, + spacing_type=SpacingType.LINEAR, + step=0.1, + align_to_multiples=True, + ) + + assert np.isclose(compute_bin_width(result.x_array), 0.1) + + def test_continuous_discretization_uses_requested_geometric_step(self): + step = np.log(1.05) + result = discretize_continuous_distribution( + dist=stats.lognorm(s=0.5, scale=1.0), + tail_truncation=1e-3, + bound_type=BoundType.DOMINATES, + spacing_type=SpacingType.GEOMETRIC, + step=step, + align_to_multiples=True, + ) + + assert result.step == step + assert np.isclose(np.exp(result.step), 1.05) + + def test_generated_geometric_grid_preserves_pld_default_step(self): + grid = discretize_aligned_grid( + x_min=0.1, + x_max=10.0, + spacing_type=SpacingType.GEOMETRIC, + align_to_multiples=True, + discretization=1e-4, + ) + x = grid.x_array + + dist = DenseDiscreteDist( + x_min=grid.x_min, + step=grid.step, + prob_arr=np.full(x.size, 1.0 / x.size), + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + assert dist.step == 1e-4 + assert np.allclose(dist.x_array, x) + + +class TestComputeBinWidth: + def test_uniform_grid(self): + x = np.array([1.0, 2.0, 3.0, 4.0]) + width = compute_bin_width(x) + assert 
np.isclose(width, 1.0) + + def test_nonuniform_grid_raises(self): + x = np.array([1.0, 2.0, 3.5, 6.0]) + with pytest.raises(ValueError, match="non-uniform bin widths"): + compute_bin_width(x) + + def test_single_point_raises(self): + x = np.array([1.0]) + with pytest.raises(ValueError, match="less than 2 bins"): + compute_bin_width(x) + + +class TestComputeBinLogRatio: + def test_geometric_grid(self): + x = np.array([1.0, 2.0, 4.0, 8.0]) + step = compute_bin_log_ratio(x) + assert np.isclose(step, np.log(2.0)) + + def test_nonuniform_grid_raises(self): + x = np.array([1.0, 3.0, 6.0, 18.0]) + with pytest.raises(ValueError, match="non-uniform bin widths"): + compute_bin_log_ratio(x) + + def test_single_point_raises(self): + x = np.array([1.0]) + with pytest.raises(ValueError, match="less than 2 bins"): + compute_bin_log_ratio(x) + + +class TestComputeDiscretePMF: + def test_uniform_distribution(self): + dist = stats.uniform(loc=0.0, scale=1.0) + x_array = np.linspace(0.0, 1.0, 11) + bin_prob, p_left, p_right = compute_discrete_PMF( + dist=dist, x_array=x_array, bound_type=BoundType.DOMINATES, PMF_min_increment=0.0 + ) + + assert len(bin_prob) == 10 + assert np.all(bin_prob >= 0) + assert np.allclose(bin_prob, 0.1, atol=0.01) + assert p_left < 0.01 + assert p_right < 0.01 + + def test_normal_distribution(self): + dist = stats.norm(loc=0.0, scale=1.0) + x_array = np.linspace(-3.0, 3.0, 1001) + bin_prob, p_left, p_right = compute_discrete_PMF( + dist=dist, x_array=x_array, bound_type=BoundType.DOMINATES, PMF_min_increment=0.0 + ) + + total = math.fsum([*map(float, bin_prob), p_left, p_right]) + assert np.isclose(total, 1.0, atol=1e-10) + + def test_exponential_distribution(self): + dist = stats.expon(scale=1.0) + x_array = np.linspace(0.0, 5.0, 51) + _bin_prob, p_left, p_right = compute_discrete_PMF( + dist=dist, x_array=x_array, bound_type=BoundType.DOMINATES, PMF_min_increment=0.0 + ) + + assert p_left < 0.01 + assert p_right > 0.0 + + +class TestPMFRemapToGrid: + def 
test_exact_alignment(self): + x_in = np.array([1.0, 2.0, 3.0]) + pmf_in = np.array([0.2, 0.5, 0.3], dtype=np.float64) + x_out = x_in.copy() + + pmf_out = pmf_remap_to_grid_kernel(x_in, pmf_in, x_out, dominates=True) + assert np.allclose(pmf_out, pmf_in) + + def test_dominates_rounding(self): + x_in = np.array([1.0, 2.5, 4.0]) + pmf_in = np.array([0.3, 0.4, 0.3], dtype=np.float64) + x_out = np.array([1.0, 2.0, 3.0, 4.0]) + + pmf_out = pmf_remap_to_grid_kernel(x_in, pmf_in, x_out, dominates=True) + assert pmf_out[2] >= 0.4 + + def test_is_dominated_rounding(self): + x_in = np.array([1.0, 2.5, 4.0]) + pmf_in = np.array([0.3, 0.4, 0.3], dtype=np.float64) + x_out = np.array([1.0, 2.0, 3.0, 4.0]) + + pmf_out = pmf_remap_to_grid_kernel(x_in, pmf_in, x_out, dominates=False) + assert pmf_out[1] >= 0.4 + + def test_overflow_to_infinity(self): + x_in = np.array([1.0, 2.0, 5.0]) + pmf_in = np.array([0.3, 0.4, 0.3], dtype=np.float64) + x_out = np.array([1.0, 2.0, 3.0]) + + pmf_out = pmf_remap_to_grid_kernel(x_in, pmf_in, x_out, dominates=True) + _, _, ppos = enforce_mass_conservation( + prob_arr=pmf_out, expected_p_min=0.0, expected_p_max=0.0, bound_type=BoundType.DOMINATES + ) + assert ppos >= 0.3 + + def test_mass_conservation_in_remap(self): + x_in = np.array([0.5, 1.5, 2.5, 3.5]) + pmf_in = np.array([0.1, 0.3, 0.4, 0.2], dtype=np.float64) + x_out = np.array([1.0, 2.0, 3.0]) + + pmf_out = pmf_remap_to_grid_kernel(x_in, pmf_in, x_out, dominates=True) + total_in = math.fsum(map(float, pmf_in)) + pmf_out, pneg, ppos = enforce_mass_conservation( + prob_arr=pmf_out, expected_p_min=0.0, expected_p_max=0.0, bound_type=BoundType.DOMINATES + ) + total_out = math.fsum([*map(float, pmf_out), pneg, ppos]) + assert np.isclose(total_in, total_out, atol=1e-10) + + +class TestEnforceMassConservation: + def test_dominates_can_consume_soft_p_min(self): + prob_arr = np.array([0.4, 0.1], dtype=np.float64) + prob_out, p_min, p_max = enforce_mass_conservation( + prob_arr=prob_arr, + 
expected_p_min=0.3, + expected_p_max=0.4, + bound_type=BoundType.DOMINATES, + ) + + assert np.allclose(prob_out, np.array([0.4, 0.1])) + assert np.isclose(p_min, 0.1) + assert np.isclose(p_max, 0.4) + assert np.isclose(math.fsum([*map(float, prob_out), p_min, p_max]), 1.0) + + def test_is_dominated_can_consume_soft_p_max(self): + prob_arr = np.array([0.1, 0.4], dtype=np.float64) + prob_out, p_min, p_max = enforce_mass_conservation( + prob_arr=prob_arr, + expected_p_min=0.4, + expected_p_max=0.3, + bound_type=BoundType.IS_DOMINATED, + ) + + assert np.allclose(prob_out, np.array([0.1, 0.4])) + assert np.isclose(p_min, 0.4) + assert np.isclose(p_max, 0.1) + assert np.isclose(math.fsum([*map(float, prob_out), p_min, p_max]), 1.0) + + +class TestComputeTruncation: + def test_strips_zero_edges_before_tail_truncation(self): + new_prob_arr, new_p_min, new_p_max, min_ind, max_ind = compute_truncation( + prob_arr=np.array([0.0, 0.8], dtype=np.float64), + p_min=0.0, + p_max=0.2, + tail_truncation=0.1, + bound_type=BoundType.DOMINATES, + ) + + assert np.allclose(new_prob_arr, np.array([0.8], dtype=np.float64)) + assert np.isclose(new_p_min, 0.0) + assert np.isclose(new_p_max, 0.2) + assert (min_ind, max_ind) == (1, 1) + + def test_keeps_boundary_when_it_is_the_first_remaining_element(self): + new_prob_arr, new_p_min, new_p_max, min_ind, max_ind = compute_truncation( + prob_arr=np.array([0.0, 0.2, 0.5], dtype=np.float64), + p_min=0.3, + p_max=0.0, + tail_truncation=0.1, + bound_type=BoundType.DOMINATES, + ) + + assert np.allclose(new_prob_arr, np.array([0.2, 0.5], dtype=np.float64)) + assert np.isclose(new_p_min, 0.3) + assert np.isclose(new_p_max, 0.0) + assert (min_ind, max_ind) == (1, 2) + + def test_truncation_folds_consumed_boundary_into_first_finite_bin(self): + new_prob_arr, new_p_min, new_p_max, min_ind, max_ind = compute_truncation( + prob_arr=np.array([0.2, 0.75], dtype=np.float64), + p_min=0.05, + p_max=0.0, + tail_truncation=0.1, + bound_type=BoundType.DOMINATES, + 
) + + assert np.allclose(new_prob_arr, np.array([0.25, 0.75], dtype=np.float64)) + assert np.isclose(new_p_min, 0.0) + assert np.isclose(new_p_max, 0.0) + assert (min_ind, max_ind) == (0, 1) + + def test_strips_zero_edges_for_is_dominated_right_tail(self): + new_prob_arr, new_p_min, new_p_max, min_ind, max_ind = compute_truncation( + prob_arr=np.array([0.8, 0.0], dtype=np.float64), + p_min=0.2, + p_max=0.0, + tail_truncation=0.1, + bound_type=BoundType.IS_DOMINATED, + ) + + assert np.allclose(new_prob_arr, np.array([0.8], dtype=np.float64)) + assert np.isclose(new_p_min, 0.2) + assert np.isclose(new_p_max, 0.0) + assert (min_ind, max_ind) == (0, 0) + + def test_dense_truncate_edges_updates_x_min_after_zero_edge_removal(self): + dist = DenseDiscreteDist( + x_min=0.0, + step=1.0, + prob_arr=np.array([0.0, 0.8], dtype=np.float64), + p_max=0.2, + ) + + result = dist.truncate_edges(0.1, BoundType.DOMINATES) + + assert np.allclose(result.x_array, np.array([1.0], dtype=np.float64)) + assert np.allclose(result.prob_arr, np.array([0.8], dtype=np.float64)) + assert np.isclose(result.p_min, 0.0) + assert np.isclose(result.p_max, 0.2) + + def test_dense_truncate_edges_removes_right_tail_under_truncation_budget(self): + dist = DenseDiscreteDist( + x_min=1.0, + step=1.0, + prob_arr=np.array([0.8, 0.1, 0.1], dtype=np.float64), + ) + + result = dist.truncate_edges(0.15, BoundType.DOMINATES) + + assert np.allclose(result.x_array, np.array([1.0, 2.0], dtype=np.float64)) + assert np.allclose(result.prob_arr, np.array([0.8, 0.1], dtype=np.float64)) + assert np.isclose(result.p_min, 0.0) + assert np.isclose(result.p_max, 0.1) + + +class TestZeroMass: + def test_raises_when_mass_is_at_least_total(self): + with pytest.raises(ValueError, match="mass must be smaller than total array mass"): + _zero_mass( + values=np.array([0.2, 0.8], dtype=np.float64), + mass=1.0, + from_left=True, + exact=True, + ) + + +class TestRediscretizeBoundaryFolding: + def 
test_rediscretize_near_point_mass_distribution(self): + dist = DenseDiscreteDist( + x_min=0.5, + step=0.5, + prob_arr=np.array([1.0 - 1e-6, 1e-6], dtype=np.float64), + ) + + result = rediscretize_dist( + dist=dist, + tail_truncation=1e-8, + loss_discretization=1e-2, + spacing_type=SpacingType.LINEAR, + bound_type=BoundType.DOMINATES, + ) + + assert np.isclose(result.step, 1e-2) + assert np.isclose(result.x_array[0], 0.5) + assert np.isclose(result.x_array[-1], 1.0) + total = math.fsum([*map(float, result.prob_arr), result.p_min, result.p_max]) + assert np.isclose(total, 1.0) + + def test_is_dominated_moves_p_max_into_last_finite_cell(self): + dist = DenseDiscreteDist.from_x_array( + x_array=np.array([0.0, 1.0, 2.0], dtype=np.float64), + prob_arr=np.array([0.2, 0.3, 0.4], dtype=np.float64), + p_max=0.1, + ) + + result = rediscretize_dist( + dist=dist, + tail_truncation=0.0, + loss_discretization=1.0, + spacing_type=SpacingType.LINEAR, + bound_type=BoundType.IS_DOMINATED, + ) + + assert np.isclose(result.p_max, 0.0) + assert np.isclose(result.prob_arr[-1], 0.5) + assert np.isclose( + math.fsum([*map(float, result.prob_arr), result.p_min, result.p_max]), 1.0 + ) + + def test_dominates_linear_moves_p_min_into_first_finite_cell(self): + dist = DenseDiscreteDist.from_x_array( + x_array=np.array([0.0, 1.0, 2.0], dtype=np.float64), + prob_arr=np.array([0.2, 0.3, 0.4], dtype=np.float64), + p_min=0.1, + ) + + result = rediscretize_dist( + dist=dist, + tail_truncation=0.0, + loss_discretization=1.0, + spacing_type=SpacingType.LINEAR, + bound_type=BoundType.DOMINATES, + ) + + assert np.isclose(result.p_min, 0.0) + assert np.isclose(result.prob_arr[0], 0.3) + assert np.isclose( + math.fsum([*map(float, result.prob_arr), result.p_min, result.p_max]), 1.0 + ) + + def test_dominates_geometric_keeps_zero_atom(self): + dist = DenseDiscreteDist.from_x_array( + x_array=np.array([1.0, 2.0, 4.0], dtype=np.float64), + prob_arr=np.array([0.2, 0.3, 0.4], dtype=np.float64), + p_min=0.1, + 
spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + result = rediscretize_dist( + dist=dist, + tail_truncation=0.0, + loss_discretization=np.log(2.0), + spacing_type=SpacingType.GEOMETRIC, + bound_type=BoundType.DOMINATES, + ) + + assert np.isclose(result.p_min, 0.1) + assert np.isclose( + math.fsum([*map(float, result.prob_arr), result.p_min, result.p_max]), 1.0 + ) diff --git a/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_types.py b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_types.py new file mode 100644 index 00000000..e4cd9db2 --- /dev/null +++ b/python/dp_accounting/dp_accounting/pld/random_allocation/random_allocation_types.py @@ -0,0 +1,112 @@ +"""Core type definitions and low-level PMF validation for random-allocation.""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum + +import numpy as np +from numpy.typing import NDArray + +# ============================================================================= +# Discrete Distribution Types +# ============================================================================= + + +class BoundType(Enum): + """Tie-breaking bound_type for discretization.""" + + DOMINATES = "DOMINATES" + IS_DOMINATED = "IS_DOMINATED" + + +class SpacingType(Enum): + """Grid spacing_type strategy.""" + + LINEAR = "linear" + GEOMETRIC = "geometric" + + +@dataclass(frozen=True) +class RegularGrid: + """Regular grid metadata with exact spacing semantics.""" + + x_min: float + step: float + size: int + spacing_type: SpacingType + + @property + def x_array(self) -> NDArray[np.float64]: + indices = np.arange(self.size, dtype=np.float64) + if self.spacing_type == SpacingType.LINEAR: + return self.x_min + self.step * indices + if self.spacing_type == SpacingType.GEOMETRIC: + return self.x_min * np.exp(self.step * indices) + raise ValueError(f"Unknown SpacingType: {self.spacing_type}") + + +class Direction(Enum): + 
"""Enum for direction of privacy analysis.""" + + ADD = "add" + REMOVE = "remove" + + +# Defaults for AllocationSchemeConfig (independent of REALIZATION_MOMENT_TOL; +# tail budget is a modeling choice). +DEFAULT_LOSS_DISCRETIZATION = 1e-4 +DEFAULT_TAIL_TRUNCATION = 1e-12 + + +@dataclass +class PrivacyParams: + """Parameters common to all privacy schemes.""" + + sigma: float + num_steps: int + num_selected: int = 1 + num_epochs: int = 1 + epsilon: float | None = None + delta: float | None = None + + +@dataclass +class AllocationSchemeConfig: + """Configuration for privacy schemes.""" + + loss_discretization: float = DEFAULT_LOSS_DISCRETIZATION + tail_truncation: float = DEFAULT_TAIL_TRUNCATION + max_grid_mult: int = -1 # -1 means no upper limit on grid size + + +# ============================================================================= +# Discrete PMF validation +# ============================================================================= + + +def validate_discrete_pmf_and_boundaries( + prob_arr: NDArray[np.float64], + p_min: float, + p_max: float, +) -> None: + """Validate 1-D nonnegative PMF and nonnegative boundary masses. + + Args: + prob_arr: Finite-support probability masses. + p_min: Lower boundary mass (e.g. mass at ``-∞`` or 0). + p_max: Upper boundary mass (e.g. mass at ``+∞``). + + Raises: + ValueError: If shape or nonnegativity checks fail. 
"""Utility functions for distribution operations and numerical stability."""

from __future__ import annotations

import math
from typing import Any, Callable

import numpy as np
from numba import njit
from numpy.typing import NDArray

from .random_allocation_distributions import (
    DenseDiscreteDist,
    DiscreteDistBase,
    Domain,
    PLDRealization,
    SparseDiscreteDist,
    enforce_mass_conservation,
    stable_array_equal,
)
from .random_allocation_types import (
    AllocationSchemeConfig,
    BoundType,
    PrivacyParams,
    SpacingType,
)


# =============================================================================
# Boundary-Mass Convolution Utilities
# =============================================================================


def convolve_boundary_masses(
    p_min_1: float,
    p_max_1: float,
    p_min_2: float,
    p_max_2: float,
    domain: Domain,
) -> tuple[float, float]:
    """Compute boundary masses (p_min, p_max) for the convolution Z = X + Y.

    Domain semantics differ for the lower boundary:
    - REALS (−∞ absorbing): P(Z=−∞) = 1 − (1−p_min_1)(1−p_min_2)
    - POSITIVES (0 neutral): P(Z=0) = p_min_1 · p_min_2

    The upper boundary (+∞ absorbing) uses the same formula for both domains:
    P(Z=+∞) = 1 − (1−p_max_1)(1−p_max_2)

    Both inputs must share the same domain.

    Args:
        p_min_1: Lower boundary mass of X.
        p_max_1: Upper boundary mass of X.
        p_min_2: Lower boundary mass of Y.
        p_max_2: Upper boundary mass of Y.
        domain: Shared domain of X and Y.

    Returns:
        Tuple ``(p_min, p_max)`` of boundary masses for Z.
    """

    # p_max: +∞ is always absorbing. Evaluated via log1p/expm1 for accuracy
    # when the masses are tiny; the clip absorbs floating-point round-off.
    p_max = float(np.clip(-np.expm1(np.log1p(-p_max_1) + np.log1p(-p_max_2)), 0.0, 1.0))

    # p_min: depends on domain
    if domain == Domain.POSITIVES:
        # 0 is neutral: Z=0 only when both X=0 and Y=0
        p_min = p_min_1 * p_min_2
    else:
        # −∞ is absorbing: Z=−∞ when either X=−∞ or Y=−∞
        p_min = float(np.clip(-np.expm1(np.log1p(-p_min_1) + np.log1p(-p_min_2)), 0.0, 1.0))

    return p_min, p_max


def self_convolve_boundary_masses(
    dist: DiscreteDistBase,
    num_convolutions: int,
) -> tuple[float, float]:
    """Compute boundary masses after ``num_convolutions`` self-convolutions.

    Args:
        dist: Distribution whose boundary atoms are being composed.
        num_convolutions: Number of independent copies summed.

    Returns:
        Tuple ``(p_min, p_max)`` of boundary masses for the sum.
    """
    # p_max: absorbing in both domains
    p_max = float(np.clip(-np.expm1(num_convolutions * np.log1p(-dist.p_max)), 0.0, 1.0))

    if dist.domain == Domain.POSITIVES:
        # 0 is neutral: Z=0 only when all num_convolutions factors are 0
        p_min = dist.p_min**num_convolutions
    else:
        # −∞ is absorbing: Z=−∞ when any copy is −∞
        p_min = float(np.clip(-np.expm1(num_convolutions * np.log1p(-dist.p_min)), 0.0, 1.0))

    return p_min, p_max


# =============================================================================
# Public Convolution Utilities
# =============================================================================


def binary_self_convolve(
    *,
    dist: DenseDiscreteDist,
    T: int,
    tail_truncation: float,
    bound_type: BoundType,
    convolve: Callable[..., DenseDiscreteDist],
) -> DenseDiscreteDist:
    """Exponentiation by squaring-based self-convolution using a provided convolve function.

    Algorithm 3 (`self-conv`) in Appendix C.

    Args:
        dist: Base distribution to self-convolve.
        T: Multiplicity of the self-convolution (must be >= 1).
        tail_truncation: Total tail-mass budget spread across the
            intermediate pairwise convolutions.
        bound_type: Tie-breaking direction forwarded to ``convolve``.
        convolve: Pairwise convolution callable taking ``dist_1``, ``dist_2``,
            ``tail_truncation`` and ``bound_type``.

    Returns:
        The T-fold self-convolution of ``dist``.

    Raises:
        ValueError: If ``T < 1``.
    """
    if T < 1:
        raise ValueError(f"T must be >= 1, got {T}")
    if T == 1:
        return dist

    base_dist = dist
    acc_dist = None
    # Reduce the per-call budget up front; each pairwise convolution below then
    # receives a slice scaled by the remaining multiplicity T.
    tail_truncation /= 4
    while T > 0:
        if T & 1:
            if acc_dist is None:
                acc_dist = base_dist
            else:
                acc_dist = convolve(
                    dist_1=acc_dist,
                    dist_2=base_dist,
                    tail_truncation=tail_truncation / T,
                    bound_type=bound_type,
                )
        T >>= 1
        if T > 0:
            base_dist = convolve(
                dist_1=base_dist,
                dist_2=base_dist,
                tail_truncation=tail_truncation / T,
                bound_type=bound_type,
            )
    # The most significant bit of T is always 1, so acc_dist is assigned on the
    # final loop iteration for every T >= 1 (including powers of two); the
    # fallback below is purely defensive.
    return acc_dist if acc_dist is not None else base_dist
+ """ + if T < 1: + raise ValueError(f"T must be >= 1, got {T}") + if T == 1: + return dist + + base_dist = dist + acc_dist = None + tail_truncation /= 4 + while T > 0: + if T & 1: + if acc_dist is None: + acc_dist = base_dist + else: + acc_dist = convolve( + dist_1=acc_dist, + dist_2=base_dist, + tail_truncation=tail_truncation / T, + bound_type=bound_type, + ) + T >>= 1 + if T > 0: + base_dist = convolve( + dist_1=base_dist, + dist_2=base_dist, + tail_truncation=tail_truncation / T, + bound_type=bound_type, + ) + # If T is a power of two, acc_dist is never set; return the final squared base_dist. + return acc_dist if acc_dist is not None else base_dist + + +def combine_distributions( + *, dist_1: DiscreteDistBase, dist_2: DiscreteDistBase, bound_type: BoundType +) -> SparseDiscreteDist: + """Combine two distributions by tightening bounds via CCDF min/max. + + For DOMINATES: returns tighter dominating distribution using pointwise min CCDF. + For IS_DOMINATED: returns tighter dominated distribution using pointwise max CCDF. 
+ """ + ccdf_op: Any + if bound_type == BoundType.DOMINATES: + ccdf_op = np.minimum + elif bound_type == BoundType.IS_DOMINATED: + ccdf_op = np.maximum + else: + raise ValueError(f"Unknown BoundType: {bound_type}") + + if stable_array_equal(a=dist_1.x_array, b=dist_2.x_array): + dist_1_aligned, dist_2_aligned = dist_1, dist_2 + else: + dist_1_aligned, dist_2_aligned = _align_distributions_to_union_grid( + dist_1=dist_1, + dist_2=dist_2, + ) + + x_array = dist_1_aligned.x_array + ccdf_1 = _ccdf_from_pmf(dist_1_aligned) + ccdf_2 = _ccdf_from_pmf(dist_2_aligned) + combined_ccdf = ccdf_op(ccdf_1, ccdf_2) + prob_arr = combined_ccdf[:-2] - combined_ccdf[1:-1] + + prob_arr, p_min, p_max = enforce_mass_conservation( + prob_arr=prob_arr, + expected_p_min=max(dist_1_aligned.p_min, dist_2_aligned.p_min), + expected_p_max=max(dist_1_aligned.p_max, dist_2_aligned.p_max), + bound_type=bound_type, + ) + + return SparseDiscreteDist( + x_array=x_array, + prob_arr=prob_arr, + p_min=p_min, + p_max=p_max, + ) + + +# ============================================================================= +# Distribution Transform Utilities +# ============================================================================= + + +def exp_linear_to_geometric(dist: DenseDiscreteDist) -> DenseDiscreteDist: + """Apply exp(.) to a linear-grid distribution, producing a geometric-grid distribution. + + Maps REALS domain → POSITIVES domain. + The −∞ atom (p_min in REALS) maps to the 0 atom (p_min in POSITIVES). 
+ """ + if dist.spacing_type != SpacingType.LINEAR: + raise ValueError( + f"exp_linear_to_geometric requires LINEAR spacing input, got {dist.spacing_type}" + ) + x_min_exp = float(np.exp(dist.x_min)) + + return DenseDiscreteDist( + x_min=x_min_exp, + step=dist.step, + prob_arr=dist.prob_arr.copy(), + p_min=dist.p_min, # −∞ atom → 0 atom (p_min identity preserved) + p_max=dist.p_max, # +∞ atom unchanged + spacing_type=SpacingType.GEOMETRIC, + domain=Domain.POSITIVES, + ) + + +def log_geometric_to_linear(dist: DenseDiscreteDist) -> DenseDiscreteDist: + """Apply log(.) to a geometric-grid distribution, producing a linear-grid distribution. + + Maps POSITIVES domain → REALS domain. + The 0 atom (p_min in POSITIVES) maps to the −∞ atom (p_min in REALS). + """ + if dist.spacing_type != SpacingType.GEOMETRIC: + raise ValueError( + f"log_geometric_to_linear requires GEOMETRIC spacing input, got {dist.spacing_type}" + ) + x_min_log = float(np.log(dist.x_min)) + + return DenseDiscreteDist( + x_min=x_min_log, + step=dist.step, + prob_arr=dist.prob_arr.copy(), + p_min=dist.p_min, # 0 atom → −∞ atom (p_min identity preserved) + p_max=dist.p_max, + spacing_type=SpacingType.LINEAR, + domain=Domain.REALS, + ) + + +def negate_reverse_linear_distribution( + dist: DenseDiscreteDist, +) -> DenseDiscreteDist: + """Map X -> -X, reverse PMF order, and swap boundary atoms.""" + n = dist.prob_arr.size + return DenseDiscreteDist( + x_min=-(dist.x_min + dist.step * (n - 1)), + step=dist.step, + prob_arr=np.flip(dist.prob_arr), + p_min=dist.p_max, + p_max=dist.p_min, + ) + + +def calc_pld_dual(realization: PLDRealization) -> PLDRealization: + """Compute the paper PLD dual ``D(L)`` (Definition 3.1). + + Algorithm 7 (`PLD-dual`) in Appendix C. + + For a PLD realization ``L`` with support ``l`` and mass ``f_L(l)``, the dual has: + - finite mass ``f_D(-l) = f_L(l) * exp(-l)``, + - support reflected to ``-l``, + - residual mass at ``+inf``. 
+ """ + if not isinstance(realization, PLDRealization): + raise TypeError(f"calc_pld_dual requires PLDRealization, got {type(realization)}") + + dual_probs_aligned = np.zeros_like(realization.prob_arr) + mask = realization.prob_arr > 0 + dual_probs_aligned[mask] = np.exp( + np.log(realization.prob_arr[mask]) - realization.x_array[mask] + ) + dual_probs = np.flip(dual_probs_aligned) + + sum_prob = math.fsum(map(float, dual_probs)) + if sum_prob > 1.0: + dual_probs *= 1.0 / sum_prob + sum_prob = 1.0 + + return PLDRealization( + x_min=-(realization.x_min + realization.step * (realization.prob_arr.size - 1)), + step=realization.step, + prob_arr=dual_probs, + p_max=max(0.0, 1.0 - sum_prob), + p_min=0.0, + ) + + +# ============================================================================= +# Internal Helper Functions +# ============================================================================= + + +def _align_distributions_to_union_grid( + *, + dist_1: DiscreteDistBase, + dist_2: DiscreteDistBase, +) -> tuple[SparseDiscreteDist, SparseDiscreteDist]: + """Return distributions on a shared grid by inserting zero-mass points.""" + x_union = np.unique(np.concatenate((dist_1.x_array, dist_2.x_array))) + return ( + _expand_to_grid( + dist=dist_1, + grid=x_union, + ), + _expand_to_grid( + dist=dist_2, + grid=x_union, + ), + ) + + +def _expand_to_grid( + *, + dist: DiscreteDistBase, + grid: NDArray[np.float64], +) -> SparseDiscreteDist: + """Insert zero-mass points for missing support values.""" + x = dist.x_array + pmf = dist.prob_arr + expanded_pmf = np.zeros_like(grid, dtype=np.float64) + indices = np.searchsorted(grid, x) + if not np.all(grid[indices] == x): + raise ValueError("Target grid must contain all original support points") + expanded_pmf[indices] = pmf + return SparseDiscreteDist( + x_array=grid, + prob_arr=expanded_pmf, + p_min=dist.p_min, + p_max=dist.p_max, + ) + + +def _ccdf_from_pmf(dist: DiscreteDistBase) -> NDArray[np.float64]: + """Convert distribution 
@njit(cache=True)
def _kahan_reverse_exclusive_cumsum(
    padded_probs: NDArray[np.float64],
) -> NDArray[np.float64]:
    """Compute exclusive CCDF using Kahan summation for numerical stability.

    Computes exclusive reverse cumulative sum: CCDF[i] = sum(padded_probs[i+1:]).
    Uses Kahan compensated summation to minimize floating-point rounding errors.
    """
    n = len(padded_probs)
    ccdf = np.zeros(n, dtype=np.float64)

    # Start from the right (highest index) and accumulate backwards
    running_sum = 0.0
    compensation = 0.0

    for i in range(n - 1, -1, -1):
        # Store the running sum BEFORE adding current element (exclusive)
        ccdf[i] = running_sum

        # Kahan summation: compensated addition of current element
        y = padded_probs[i] - compensation
        t = running_sum + y
        compensation = (t - running_sum) - y
        running_sum = t

    return ccdf


# =============================================================================
# Privacy Parameter Validation
# =============================================================================


def validate_privacy_params(
    params: PrivacyParams,
    *,
    require_delta: bool = False,
    require_epsilon: bool = False,
) -> None:
    """Validate PrivacyParams object.

    Args:
        params: Privacy parameters to validate.
        require_delta: If True, validate that delta is set and in valid range (0, 1).
        require_epsilon: If True, validate that epsilon is set and positive.

    Raises:
        TypeError: If params is not a PrivacyParams instance.
        ValueError: If any parameter value is invalid.
    """
    if not isinstance(params, PrivacyParams):
        raise TypeError(f"params must be PrivacyParams, got {type(params)}")
    validate_gaussian_params(params.sigma, params.num_steps, params.num_selected, params.num_epochs)
    if require_delta:
        validate_delta(params.delta)
    if require_epsilon:
        validate_epsilon(params.epsilon)


def validate_gaussian_params(
    sigma: float,
    num_steps: int,
    num_selected: int,
    num_epochs: int,
) -> None:
    """Validate Gaussian allocation parameters.

    Args:
        sigma: Gaussian noise scale.
        num_steps: Total number of random-allocation steps.
        num_selected: Number of selections per epoch.
        num_epochs: Number of epochs.

    Raises:
        ValueError: If any parameter value is invalid.

    """
    # NaN comparisons are always False, so an explicit isnan check is needed;
    # without it a NaN sigma would silently pass validation.
    if math.isnan(sigma) or sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma}")
    validate_allocation_params(num_steps, num_selected, num_epochs)


def validate_allocation_params(
    num_steps: int,
    num_selected: int,
    num_epochs: int,
) -> None:
    """Validate allocation parameters.

    Args:
        num_steps: Total number of random-allocation steps.
        num_selected: Number of selections per epoch.
        num_epochs: Number of epochs.

    Raises:
        ValueError: If any parameter value is invalid.

    """
    if num_steps < 1 or num_selected < 1 or num_epochs < 1:
        raise ValueError(
            f"num_steps (={num_steps}), num_selected (={num_selected}), "
            f"and num_epochs (={num_epochs}) must be >= 1"
        )
    if num_selected > num_steps:
        raise ValueError(f"num_selected ({num_selected}) cannot exceed num_steps ({num_steps})")


def validate_delta(delta: float | None) -> None:
    """Validate delta value.

    Args:
        delta: Delta value for differential privacy.

    Raises:
        ValueError: If delta is None or not in the valid range (0, 1).

    """
    # A NaN delta fails the chained comparison (both bounds compare False),
    # so it is rejected here without an explicit isnan check.
    if delta is None or not 0 < delta < 1:
        raise ValueError(f"delta must be in (0, 1), got {delta}")


def validate_epsilon(epsilon: float | None) -> None:
    """Validate epsilon value.

    Args:
        epsilon: Epsilon value for differential privacy.

    Raises:
        ValueError: If epsilon is None, NaN, or not positive.

    """
    # NaN <= 0 is False, so guard explicitly against NaN epsilon.
    if epsilon is None or math.isnan(epsilon) or epsilon <= 0:
        raise ValueError(f"epsilon must be positive, got {epsilon}")
+ + Args: + epsilon: Epsilon value for differential privacy. + + Raises: + ValueError: If epsilon is None or not positive. + + """ + if epsilon is None or epsilon <= 0: + raise ValueError(f"epsilon must be positive, got {epsilon}") + + +# ============================================================================= +# Bound Type Validation +# ============================================================================= + + +def validate_bound_type(bound_type: BoundType) -> None: + """Validate BoundType enum value. + + Args: + bound_type: The bound type to validate. + + Raises: + ValueError: If bound_type is not DOMINATES or IS_DOMINATED. + + """ + if bound_type not in (BoundType.DOMINATES, BoundType.IS_DOMINATED): + raise ValueError(f"Invalid bound_type: {bound_type}") + + +# ============================================================================= +# Discretization Parameter Validation +# ============================================================================= + + +def validate_discretization_params( + loss_discretization: float, + tail_truncation: float, +) -> None: + """Validate discretization parameters. + + Args: + loss_discretization: Loss discretization interval. + tail_truncation: Tail truncation threshold. + + Raises: + ValueError: If any parameter is invalid. + + """ + if loss_discretization <= 0: + raise ValueError(f"loss_discretization must be positive, got {loss_discretization}") + if tail_truncation <= 0: + raise ValueError(f"tail_truncation must be positive, got {tail_truncation}") + + +def validate_allocation_scheme_config(config: AllocationSchemeConfig) -> None: + """Validate AllocationSchemeConfig fields. + + Args: + config: Configuration to validate. + + Raises: + TypeError: If config is not an AllocationSchemeConfig instance. + ValueError: If any field value is out of range. 
+ + """ + if not isinstance(config, AllocationSchemeConfig): + raise TypeError(f"config must be AllocationSchemeConfig, got {type(config)}") + validate_discretization_params(config.loss_discretization, config.tail_truncation) + if config.max_grid_mult != -1 and config.max_grid_mult <= 0: + raise ValueError( + f"max_grid_mult must be -1 (no limit) or a positive integer, " + f"got {config.max_grid_mult}" + ) + + +def validate_optional_discretization_params( + initial_discretization: float | None = None, + initial_tail_truncation: float | None = None, +) -> None: + """Validate optional discretization parameters. + + Args: + initial_discretization: Optional initial loss discretization interval. + initial_tail_truncation: Optional initial tail truncation threshold. + + Raises: + ValueError: If any provided parameter is invalid. + + """ + if initial_discretization is not None and initial_discretization <= 0: + raise ValueError(f"initial_discretization must be positive, got {initial_discretization}") + if initial_tail_truncation is not None and initial_tail_truncation <= 0: + raise ValueError(f"initial_tail_truncation must be positive, got {initial_tail_truncation}")