diff --git a/Cargo.toml b/Cargo.toml index 6905df8..fb418e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,6 +96,7 @@ resampling = ["dep:rubato", "dep:rayon", "audioadapter", "audioadapter-buffers"] vad = [] # Voice activity detection operations psychoacoustic = ["transforms"] # Psychoacoustic analysis: band layouts, masking thresholds, SMR parallel = ["dep:rayon"] # Parallel codec encoding/decoding via rayon (works with psychoacoustic) +opus-codec = ["psychoacoustic"] # Opus-inspired codec: SILK (speech) and CELT (music) modes plotting = ["dep:plotly", "dep:base64", "dep:serde_json", "channels", "transforms"] envelopes = [ "dynamic-range", @@ -127,6 +128,7 @@ full = [ "random-generation", "psychoacoustic", "parallel", + "opus-codec", ] full_no_plotting = [ @@ -149,6 +151,7 @@ full_no_plotting = [ "envelopes", "random-generation", "psychoacoustic", + "opus-codec", ] educational = ["dep:explainable", "dep:term-maths", "dep:open", "processing"] diff --git a/src/codecs/mod.rs b/src/codecs/mod.rs index 54e694b..117ba72 100644 --- a/src/codecs/mod.rs +++ b/src/codecs/mod.rs @@ -8,6 +8,9 @@ //! together with the [`PerceptualCodec`] and [`StereoPerceptualCodec`] implementations //! that use psychoacoustic masking to drive perceptual quantization. //! +//! An Opus-inspired codec ([`opus::OpusCodec`]) is also available under the +//! `opus-codec` feature. It supports SILK (speech) and CELT (music) modes. +//! //! ## Why //! //! `audio_samples` exposes the full signal-processing toolkit that perceptual codecs @@ -35,5 +38,11 @@ pub mod perceptual; +/// Opus codec skeleton: SILK (speech) and CELT (music) modes. +/// +/// Requires the `opus-codec` feature flag. +#[cfg(feature = "opus-codec")] +pub mod opus; + pub use perceptual::codec::{AudioCodec, decode, encode}; pub use perceptual::stereo::{StereoPerceptualCodec, StereoPerceptualEncodedAudio}; diff --git a/src/codecs/opus/celt.rs b/src/codecs/opus/celt.rs new file mode 100644 index 0000000..e0bf369 --- /dev/null +++ b/src/codecs/opus/celt.rs @@ -0,0 +1,169 @@ +//! CELT wideband audio codec: MDCT + psychoacoustic bit allocation for Opus's music mode. +//! +//! ## What +//! +//! Implements one encode/decode cycle for a single Opus CELT audio frame. +//! Each frame runs the full existing perceptual codec pipeline from +//! [`crate::codecs::perceptual`]: +//! +//! 1. MDCT analysis (window size = frame length). +//! 2. Psychoacoustic masking → per-band importance and allowed noise. +//! 3. Bit allocation: distribute the bit budget across bands proportional to +//! perceptual importance. +//! 4. Scalar quantisation of MDCT coefficients per band. +//! +//! Decoding runs the inverse: dequantise → IMDCT with overlap-add. +//! +//! ## Why +//! +//! CELT is the wideband, low-latency half of Opus. It analyses and codes the +//! entire spectrum in one MDCT block matching the frame length, making it ideal +//! for music and generic audio. Unlike SILK, it makes no speech-specific +//! assumptions. +//! +//! ## Relationship to `PerceptualCodec` +//! +//! [`celt_encode_frame`] is essentially one call to the internal +//! `encode_segment` helper from `PerceptualCodec`, scoped to a single Opus +//! frame. The [`CeltEncodedFrame`] type mirrors +//! [`crate::codecs::perceptual::codec::EncodedSegment`] with the addition of +//! the per-frame sample count. + +use std::num::{NonZeroU32, NonZeroUsize}; + +use non_empty_slice::NonEmptyVec; +use spectrograms::{MdctParams, WindowType}; + +use crate::codecs::perceptual::quantization::{BitAllocationResult, allocate_bits, dequantize, quantize}; +use crate::codecs::perceptual::{BandLayout, PsychoacousticConfig, analyse_signal_with_window_size, reconstruct_signal}; +use crate::{AudioSampleResult, AudioSamples, StandardSample}; + +// ── CeltEncodedFrame ────────────────────────────────────────────────────────── + +/// One CELT-encoded Opus audio frame. +/// +/// The in-memory representation is equivalent to +/// [`crate::codecs::perceptual::codec::EncodedSegment`] scoped to a single +/// Opus frame. The `window_size` equals the frame length, so `n_frames` is +/// typically 1 or 2 depending on how the MDCT hop interacts with the frame +/// boundary. +/// +/// Everything needed to reconstruct the frame is self-contained: MDCT +/// parameters, per-band bit allocation, and the original sample count. +#[derive(Debug, Clone)] +pub struct CeltEncodedFrame { + /// Quantised MDCT coefficients, row-major: index `k × n_frames + f`. + pub quantized: NonEmptyVec, + /// Number of MDCT bins per frame (`window_size / 2`). + pub n_coefficients: NonZeroUsize, + /// Number of MDCT analysis frames produced from this Opus frame. + pub n_frames: NonZeroUsize, + /// MDCT parameters used during analysis. + pub mdct_params: MdctParams, + /// Per-band bit allocation used for quantisation and dequantisation. + pub allocation: BitAllocationResult, + /// Number of PCM samples in the original Opus frame. + pub n_samples: usize, +} + +// ── celt_encode_frame ───────────────────────────────────────────────────────── + +/// Encodes a single CELT audio frame. +/// +/// The frame is analysed with the MDCT, processed through the psychoacoustic +/// model, and the resulting coefficients are quantised with the per-band +/// allocation from [`allocate_bits`]. +/// +/// # Arguments +/// - `frame` – Mono audio frame to encode. +/// - `band_layout` – Perceptual frequency-band partitioning (e.g. [`crate::BandLayout::celt`]). +/// - `psych_config` – Psychoacoustic masking configuration. Must have the same +/// number of weights as `band_layout.len()`. +/// - `window` – MDCT window function. [`spectrograms::WindowType::Hanning`] is a +/// reasonable default. +/// - `window_size` – Explicit MDCT window size (typically `= frame_length`, i.e. +/// the number of samples in `frame`). When `None`, an automatic size ≤ 2048 is +/// chosen. +/// - `bit_budget` – Total bits to allocate across all bands. +/// - `min_bits_per_band` – Minimum bits guaranteed to every band (typically 1). +/// +/// # Errors +/// Returns [`crate::AudioSampleError::Parameter`] if `frame` is not mono, is +/// fewer than 4 samples, or `psych_config` is incompatible with `band_layout`. +pub fn celt_encode_frame( + frame: &AudioSamples, + band_layout: &BandLayout, + psych_config: &PsychoacousticConfig, + window: WindowType, + window_size: Option, + bit_budget: u32, + min_bits_per_band: u8, +) -> AudioSampleResult { + let n_samples = frame.samples_per_channel().get(); + + let result = + analyse_signal_with_window_size(frame, window, window_size, band_layout, psych_config)?; + + let allocation = allocate_bits(&result.band_metrics, bit_budget, min_bits_per_band); + let quantized = quantize( + result.coefficients.as_non_empty_slice(), + result.n_coefficients, + result.n_frames, + &allocation, + ); + + Ok(CeltEncodedFrame { + quantized, + n_coefficients: result.n_coefficients, + n_frames: result.n_frames, + mdct_params: result.mdct_params, + allocation, + n_samples, + }) +} + +// ── celt_decode_frame ───────────────────────────────────────────────────────── + +/// Decodes a CELT-encoded Opus audio frame. +/// +/// Dequantises the MDCT coefficients and applies the IMDCT with overlap-add to +/// reconstruct the time-domain signal. +/// +/// # Arguments +/// - `frame` – A CELT frame produced by [`celt_encode_frame`]. +/// - `sample_rate` – Sample rate for the returned audio. +/// +/// # Errors +/// Returns [`crate::AudioSampleError`] if the IMDCT reconstruction fails. +/// +/// # Returns +/// A `Vec` of `frame.n_samples` reconstructed PCM samples. +pub fn celt_decode_frame( + frame: CeltEncodedFrame, + sample_rate: NonZeroU32, +) -> AudioSampleResult> { + let coefficients = dequantize( + frame.quantized.as_non_empty_slice(), + frame.n_coefficients, + frame.n_frames, + &frame.allocation, + ); + + let audio = reconstruct_signal( + &coefficients, + frame.n_coefficients, + frame.n_frames, + &frame.mdct_params, + Some(frame.n_samples), + sample_rate, + )?; + + let ch = audio + .channels() + .next() + .expect("reconstruct_signal always returns mono"); + Ok(ch + .as_slice() + .expect("mono channel is always contiguous") + .to_vec()) +} diff --git a/src/codecs/opus/codec.rs b/src/codecs/opus/codec.rs new file mode 100644 index 0000000..7667961 --- /dev/null +++ b/src/codecs/opus/codec.rs @@ -0,0 +1,378 @@ +//! [`OpusCodec`], [`OpusEncodedAudio`], and the [`AudioCodec`] implementation. +//! +//! ## What +//! +//! `OpusCodec` segments a mono audio signal into fixed-length frames and encodes +//! each frame independently using either SILK (speech) or CELT (music/generic), +//! producing an [`OpusEncodedAudio`] value that carries all information needed +//! for reconstruction. +//! +//! ## How +//! +//! ```rust,ignore +//! use audio_samples::codecs::{encode, decode}; +//! use audio_samples::codecs::opus::{OpusCodec, OpusConfig, OpusMode}; +//! use spectrograms::WindowType; +//! +//! // Encode with CELT (music mode), 20 ms frames, 128 kbps budget. +//! let config = OpusConfig::with_mode(OpusMode::Celt, 128_000); +//! let codec = OpusCodec::new(config, WindowType::Hanning); +//! let encoded = encode(&audio, codec)?; +//! +//! // Decode back to f32. +//! let recovered = decode::(encoded)?; +//! ``` +//! +//! ## Sketch limitations +//! +//! - Hybrid mode falls back to CELT. +//! - SILK frames use zero initial state (no cross-frame LPC memory). +//! - No range coding or bitstream packing — that belongs in `audio_samples_io`. +//! - CELT window size equals the frame length (not Opus's fixed 20 ms / 960 sample block). + +use std::num::{NonZeroU32, NonZeroUsize}; + +use non_empty_slice::NonEmptyVec; +use spectrograms::WindowType; + +use crate::codecs::perceptual::codec::AudioCodec; +use crate::codecs::perceptual::{BandLayout, PsychoacousticConfig}; +use crate::traits::AudioTypeConversion; +use crate::{AudioSampleError, AudioSampleResult, AudioSamples, ParameterError, StandardSample}; + +use super::celt::{CeltEncodedFrame, celt_decode_frame, celt_encode_frame}; +use super::mode::{OpusConfig, OpusMode, detect_mode}; +use super::silk::{SilkEncodedFrame, silk_decode_frame, silk_encode_frame}; + +// ── OpusFrameData ───────────────────────────────────────────────────────────── + +/// Codec-specific payload for one encoded Opus frame. +/// +/// The active variant matches the [`OpusEncodedFrame::mode`] field. +#[derive(Debug, Clone)] +pub enum OpusFrameData { + /// SILK-encoded payload: LPC coefficients and quantised residual. + Silk(SilkEncodedFrame), + /// CELT-encoded payload: quantised MDCT coefficients and bit allocation. + Celt(CeltEncodedFrame), +} + +// ── OpusEncodedFrame ────────────────────────────────────────────────────────── + +/// One encoded Opus audio frame. +/// +/// Stores the operating mode, the encoded payload, and the number of PCM +/// samples in the original frame (needed for accurate truncation on decode). +#[derive(Debug, Clone)] +pub struct OpusEncodedFrame { + /// Mode used to encode this frame. + pub mode: OpusMode, + /// Codec-specific encoded payload. + pub data: OpusFrameData, + /// Number of PCM samples in the original frame. + pub n_samples: usize, +} + +// ── OpusEncodedAudio ────────────────────────────────────────────────────────── + +/// In-memory encoded audio produced by [`OpusCodec`]. +/// +/// Contains one [`OpusEncodedFrame`] per audio frame in temporal order. +/// Everything needed to reconstruct the original signal is embedded: LPC +/// coefficients, MDCT parameters, per-band quantisation step sizes, and the +/// total original signal length. +#[derive(Debug, Clone)] +pub struct OpusEncodedAudio { + /// Encoded frames in temporal order. + pub frames: NonEmptyVec, + /// Total original signal length in samples. + pub original_length: usize, + /// Sample rate of the original signal. + pub sample_rate: NonZeroU32, +} + +// ── OpusCodec ───────────────────────────────────────────────────────────────── + +/// An Opus-inspired perceptual codec supporting SILK (speech) and CELT (music) modes. +/// +/// ## What +/// +/// `OpusCodec` segments audio into fixed-length frames (default 20 ms) and +/// encodes each frame using either: +/// +/// - **SILK** — order-16 LPC + 16-bit residual quantisation (speech). +/// - **CELT** — MDCT + CELT-band psychoacoustic masking (music/generic). +/// +/// The mode is either fixed via [`OpusConfig::mode`] or detected per-frame with +/// a spectral-flatness heuristic ([`detect_mode`]). +/// +/// ## Architecture +/// +/// ```text +/// AudioSamples +/// └── frame 0 ──► detect_mode ──► SILK encode ──► OpusEncodedFrame(Silk) +/// └── frame 1 ──► detect_mode ──► CELT encode ──► OpusEncodedFrame(Celt) +/// └── … +/// └── OpusEncodedAudio { frames, original_length, sample_rate } +/// ``` +/// +/// ## Core design decisions +/// +/// - The CELT band layout is auto-derived from the frame size and signal sample +/// rate unless [`OpusCodec::with_perceptual_config`] provides an explicit one. +/// - Hybrid mode is defined in the type system but currently falls back to CELT. +/// +/// ## What lives in the IO crate +/// +/// Bitstream packing, range coding, Ogg encapsulation, and the `.opus` container +/// format are responsibilities of `audio_samples_io`, not this crate. +/// +/// ## Example +/// +/// ```rust,ignore +/// use audio_samples::codecs::{encode, decode}; +/// use audio_samples::codecs::opus::{OpusCodec, OpusConfig, OpusMode}; +/// use spectrograms::WindowType; +/// +/// let config = OpusConfig::with_mode(OpusMode::Celt, 128_000); +/// let codec = OpusCodec::new(config, WindowType::Hanning); +/// +/// let encoded = encode(&audio, codec)?; +/// let recovered = decode::(encoded)?; +/// ``` +#[derive(Debug, Clone)] +pub struct OpusCodec { + /// Codec configuration: mode, bandwidth, bit budget, frame size. + pub config: OpusConfig, + /// Explicit CELT band layout. When `None`, a CELT-style layout is + /// auto-constructed from the signal's sample rate and frame size. + pub band_layout: Option, + /// Psychoacoustic masking config for CELT frames. When `None`, MPEG-1 + /// parameters with uniform weights are used. + pub psych_config: Option, + /// MDCT window function for CELT frames. + pub window: WindowType, +} + +impl OpusCodec { + /// Creates an `OpusCodec` with automatic CELT band layout and MPEG-1 + /// psychoacoustic parameters. + /// + /// The CELT band layout is derived from the signal's sample rate and frame + /// size at encode time, so it adapts to any input sample rate. + /// + /// # Arguments + /// - `config` – Codec configuration (mode, bandwidth, bit budget, frame size). + /// - `window` – MDCT window function for CELT frames. + #[inline] + #[must_use] + pub fn new(config: OpusConfig, window: WindowType) -> Self { + Self { + config, + band_layout: None, + psych_config: None, + window, + } + } + + /// Creates an `OpusCodec` with explicit CELT perceptual configuration. + /// + /// Use this variant when you need full control over the CELT band layout and + /// psychoacoustic parameters (e.g., for a fixed target sample rate). + /// + /// # Arguments + /// - `config` – Codec configuration. + /// - `window` – MDCT window function. + /// - `band_layout` – Explicit perceptual band layout for CELT frames. + /// - `psych_config` – Psychoacoustic masking parameters. Must be compatible + /// with `band_layout`. + #[inline] + #[must_use] + pub fn with_perceptual_config( + config: OpusConfig, + window: WindowType, + band_layout: BandLayout, + psych_config: PsychoacousticConfig, + ) -> Self { + Self { + config, + band_layout: Some(band_layout), + psych_config: Some(psych_config), + window, + } + } +} + +// ── Internal helpers ────────────────────────────────────────────────────────── + +/// Computes the frame size in samples for `sample_rate` and `frame_size_ms`. +/// +/// The result is rounded to the nearest even integer (MDCT requirement) with a +/// minimum of 4. +fn compute_frame_size(sample_rate: u32, frame_size_ms: f32) -> usize { + let raw = (sample_rate as f32 * frame_size_ms / 1000.0).round() as usize; + let even = if raw % 2 == 0 { raw } else { raw + 1 }; + even.max(4) +} + +/// Returns the CELT band layout to use, falling back to auto-construction. +fn resolve_band_layout( + band_layout: &Option, + sample_rate: u32, + n_bins: NonZeroUsize, +) -> BandLayout { + band_layout + .clone() + .unwrap_or_else(|| BandLayout::celt(sample_rate as f32, n_bins)) +} + +/// Returns the psychoacoustic config to use, falling back to MPEG-1 with +/// uniform weights for `n_bands` bands. +fn resolve_psych_config( + psych_config: &Option, + n_bands: NonZeroUsize, +) -> PsychoacousticConfig { + psych_config.clone().unwrap_or_else(|| { + let weights = PsychoacousticConfig::uniform_weights(n_bands); + PsychoacousticConfig::mpeg1(weights.as_non_empty_slice()) + }) +} + +// ── AudioCodec impl ─────────────────────────────────────────────────────────── + +impl AudioCodec for OpusCodec { + type Encoded = OpusEncodedAudio; + + fn encode( + self, + audio: &AudioSamples, + ) -> AudioSampleResult { + if audio.num_channels().get() != 1 { + return Err(AudioSampleError::Parameter(ParameterError::invalid_value( + "audio", + "OpusCodec requires mono input; mix down or extract a channel first", + ))); + } + + let sample_rate = audio.sample_rate(); + let original_length = audio.samples_per_channel().get(); + + // Frame size and CELT band layout — computed once for all frames. + let frame_size = compute_frame_size(sample_rate.get(), self.config.frame_size_ms); + // `compute_frame_size` always returns an even number >= 4 (see its doc), + // so `frame_size / 2 >= 2` and `NonZeroUsize::new` always succeeds here. + let n_bins = NonZeroUsize::new(frame_size / 2) + .expect("frame_size is always even and >= 4, so frame_size/2 >= 2"); + let band_layout = resolve_band_layout(&self.band_layout, sample_rate.get(), n_bins); + let n_bands = band_layout.len(); + let psych_config = resolve_psych_config(&self.psych_config, n_bands); + + // Convert to f32 once. + let audio_f32 = audio.to_format::(); + let channel = audio_f32 + .channels() + .next() + .expect("mono channel validated above"); + let all_samples: &[f32] = channel.as_slice().expect("mono is contiguous"); + + let mut frames: Vec = Vec::new(); + let mut offset = 0; + + while offset < original_length { + let end = (offset + frame_size).min(original_length); + let frame_samples = &all_samples[offset..end]; + let n_samples = frame_samples.len(); + + // Determine mode for this frame. + let raw_mode = self.config.mode.unwrap_or_else(|| { + detect_mode(frame_samples, sample_rate.get(), self.config.bandwidth) + }); + + // Hybrid falls back to CELT in this sketch; + // frames shorter than 4 samples are forced to SILK to avoid MDCT issues. + let effective_mode = if n_samples < 4 { + OpusMode::Silk + } else { + match raw_mode { + OpusMode::Hybrid => OpusMode::Celt, + other => other, + } + }; + + let encoded_frame = match effective_mode { + OpusMode::Silk => { + let silk_frame = silk_encode_frame(frame_samples)?; + OpusEncodedFrame { + mode: OpusMode::Silk, + data: OpusFrameData::Silk(silk_frame), + n_samples, + } + } + OpusMode::Celt | OpusMode::Hybrid => { + // Build a temporary mono AudioSamples for this frame. + let ne = NonEmptyVec::new(frame_samples.to_vec()) + .map_err(|_| AudioSampleError::EmptyData)?; + let frame_audio: AudioSamples<'static, f32> = + AudioSamples::from_mono_vec(ne, sample_rate); + + // Window size = frame length (even, >= 4). + let window_size = NonZeroUsize::new(n_samples & !1_usize) + .unwrap_or_else(|| NonZeroUsize::new(4).expect("4 > 0")); + + let celt_frame = celt_encode_frame( + &frame_audio, + &band_layout, + &psych_config, + self.window.clone(), + Some(window_size), + self.config.bit_budget, + self.config.min_bits_per_band, + )?; + OpusEncodedFrame { + mode: OpusMode::Celt, + data: OpusFrameData::Celt(celt_frame), + n_samples, + } + } + }; + + frames.push(encoded_frame); + offset = end; + } + + let frames_ne = NonEmptyVec::new(frames).map_err(|_| AudioSampleError::EmptyData)?; + + Ok(OpusEncodedAudio { + frames: frames_ne, + original_length, + sample_rate, + }) + } + + fn decode( + encoded: Self::Encoded, + ) -> AudioSampleResult> + where + f32: crate::ConvertFrom, + { + let sample_rate = encoded.sample_rate; + let target_length = encoded.original_length; + + let mut all_samples: Vec = Vec::with_capacity(target_length); + + for frame in encoded.frames.into_vec() { + let frame_samples = match frame.data { + OpusFrameData::Silk(silk_frame) => silk_decode_frame(&silk_frame), + OpusFrameData::Celt(celt_frame) => celt_decode_frame(celt_frame, sample_rate)?, + }; + all_samples.extend(frame_samples); + } + + all_samples.truncate(target_length); + + let samples_ne = NonEmptyVec::new(all_samples).map_err(|_| AudioSampleError::EmptyData)?; + let f32_audio: AudioSamples<'static, f32> = + AudioSamples::from_mono_vec(samples_ne, sample_rate); + Ok(f32_audio.to_format::()) + } +} diff --git a/src/codecs/opus/lpc.rs b/src/codecs/opus/lpc.rs new file mode 100644 index 0000000..8afbd11 --- /dev/null +++ b/src/codecs/opus/lpc.rs @@ -0,0 +1,293 @@ +//! Linear Prediction Coding (LPC) primitives for the Opus SILK speech codec mode. +//! +//! ## What +//! +//! Implements the Levinson–Durbin algorithm for computing LPC coefficients from +//! the windowed autocorrelation of a signal frame, together with the matching +//! analysis (residual computation) and synthesis (reconstruction) filters. +//! +//! ## Why +//! +//! SILK, the speech-oriented half of Opus, models the vocal-tract as an all-pole +//! filter `H(z) = G / A(z)` where `A(z)` is the LPC polynomial. The encoder +//! transmits the LPC coefficients and the prediction residual; the decoder runs +//! the synthesis filter to recover the original waveform. This module provides +//! all the building blocks that `silk.rs` needs. +//! +//! ## Round-trip correctness +//! +//! When both encoder and decoder use **zero initial state** at the start of each +//! frame the analysis/synthesis pair forms a perfect inverse: +//! +//! ```text +//! Analysis: e[n] = x[n] + Σ_{k=0}^{p-1} a[k] · x[n-1-k] +//! Synthesis: y[n] = e[n] − Σ_{k=0}^{p-1} a[k] · y[n-1-k] +//! ``` +//! +//! Substituting: `y[0] = e[0] = x[0]`, `y[1] = e[1] − a[0]·y[0] = x[1]`, etc. +//! The reconstruction is exact up to floating-point precision; quantization of +//! the residual is the only lossy step. + +/// Small diagonal loading factor applied to `R[0]` for numerical stability. +/// +/// Biases the autocorrelation matrix slightly away from singularity, which +/// can occur for signals with near-zero energy. Value follows the SILK +/// reference code convention (0.001% perturbation of the zero-lag energy). +const DIAGONAL_LOADING_EPSILON: f64 = 1e-5; + +/// Minimum prediction error to prevent numerical underflow in Levinson–Durbin. +/// +/// Clamps the error after each Levinson step. At this scale, the filter is +/// essentially an all-pass and further iterations are numerically meaningless. +const MIN_PREDICTION_ERROR: f64 = 1e-15; + +/// Default LPC predictor order used by SILK. +/// +/// SILK uses a 16th-order LP filter for narrowband/wideband speech coding. +/// Higher orders capture more spectral detail at increased computational cost. +pub const SILK_LPC_ORDER: usize = 16; + +// ── LpcCoefficients ─────────────────────────────────────────────────────────── + +/// LPC predictor coefficients and prediction error produced by Levinson–Durbin. +/// +/// `coeffs[k]` is the coefficient `a[k+1]` in the all-zero analysis filter: +/// +/// ```text +/// A(z) = 1 + a[1]·z⁻¹ + a[2]·z⁻² + … + a[p]·z⁻ᵖ +/// ``` +/// +/// The analysis filter computes `e[n] = x[n] + Σ a[k+1]·x[n-k-1]`, i.e. the +/// coefficient stored at `coeffs[k]` is the gain on `x[n-k-1]`. +#[derive(Debug, Clone)] +pub struct LpcCoefficients { + /// Predictor coefficients `a[1..=order]`, zero-indexed. + /// Length equals the order requested in [`lpc_analysis`]. + pub coeffs: Vec, + /// Residual prediction error energy after the final Levinson–Durbin step. + /// + /// A value near zero indicates the predictor nearly perfectly models the + /// signal. Always non-negative; protected against underflow to `1e-15`. + pub prediction_error: f64, +} + +// ── Autocorrelation ─────────────────────────────────────────────────────────── + +/// Computes the autocorrelation of `samples` up to lag `max_lag` (inclusive). +/// +/// A Hamming window is applied to the samples before computing correlations. +/// This reduces spectral leakage and improves the conditioning of the Toeplitz +/// system solved by Levinson–Durbin, at the cost of slightly underestimating +/// the true autocorrelation. +/// +/// Returns a `Vec` of length `max_lag + 1` where index `k` holds `R[k]`. +/// +/// # Arguments +/// - `samples` – Input signal frame. If empty, returns all-zero autocorrelation. +/// - `max_lag` – Maximum lag to compute. Clamped to `samples.len() − 1`. +#[must_use] +pub fn compute_autocorrelation(samples: &[f32], max_lag: usize) -> Vec { + let n = samples.len(); + if n == 0 { + return vec![0.0; max_lag + 1]; + } + + // Apply a Hamming window for stability (avoids spectral leakage). + let windowed: Vec = samples + .iter() + .enumerate() + .map(|(i, &x)| { + let cos_arg = if n > 1 { + 2.0 * std::f64::consts::PI * i as f64 / (n - 1) as f64 + } else { + 0.0 + }; + let w = 0.54 - 0.46 * cos_arg.cos(); + x as f64 * w + }) + .collect(); + + let effective_max = max_lag.min(n - 1); + (0..=effective_max) + .map(|lag| { + (0..n - lag) + .map(|i| windowed[i] * windowed[i + lag]) + .sum::() + }) + .collect() +} + +// ── Levinson–Durbin ─────────────────────────────────────────────────────────── + +/// Computes LPC coefficients from autocorrelation using the Levinson–Durbin algorithm. +/// +/// Solves the Yule–Walker equations `R · a = −r` for the predictor coefficient +/// vector `a` of length `order`. Returns `None` if the autocorrelation is near +/// zero (silent or near-silent frame). +/// +/// A small diagonal loading (`R[0] × 10⁻⁵`) is applied before the recursion to +/// prevent numerical instability in near-singular cases. +/// +/// # Arguments +/// - `autocorr` – Autocorrelation values `R[0..=order]`. Must have length ≥ `order + 1`. +/// - `order` – Predictor order. +/// +/// # Returns +/// `Some(LpcCoefficients)` on success, `None` for a zero-energy frame. +#[must_use] +pub fn levinson_durbin(autocorr: &[f64], order: usize) -> Option { + if autocorr.len() < order + 1 || autocorr[0] < 1e-12 { + return None; + } + + // Diagonal loading for numerical stability. + let mut r = autocorr.to_vec(); + r[0] *= 1.0 + DIAGONAL_LOADING_EPSILON; + + let mut a = vec![0.0f64; order]; + let mut a_prev = vec![0.0f64; order]; + let mut error = r[0]; + + for m in 0..order { + // Compute the reflection coefficient k_m. + let mut num = r[m + 1]; + for i in 0..m { + num += a_prev[i] * r[m - i]; + } + let k = -num / error; + + // Update predictor coefficients. + for i in 0..m { + a[i] = a_prev[i] + k * a_prev[m - 1 - i]; + } + a[m] = k; + + // Update prediction error; clamp against underflow. + error = (error * (1.0 - k * k)).max(MIN_PREDICTION_ERROR); + a_prev.clone_from(&a); + } + + let coeffs = a.iter().map(|&v| v as f32).collect(); + Some(LpcCoefficients { coeffs, prediction_error: error }) +} + +// ── High-level analysis entry point ────────────────────────────────────────── + +/// Performs LPC analysis on a signal frame. +/// +/// Computes the windowed autocorrelation up to `order` lags and solves +/// Levinson–Durbin to obtain the predictor coefficients. If the frame is +/// silent or too short for the requested order, a zero-coefficient predictor +/// is returned. +/// +/// The effective order is clamped to `min(order, samples.len() / 2)` so +/// that the autocorrelation matrix always has enough data to be well-posed. +/// +/// # Arguments +/// - `samples` – Input signal frame. Must be non-empty. +/// - `order` – Predictor order. Use [`SILK_LPC_ORDER`] for SILK-standard quality. +#[must_use] +pub fn lpc_analysis(samples: &[f32], order: usize) -> LpcCoefficients { + let effective_order = order.min(samples.len() / 2).max(1); + let autocorr = compute_autocorrelation(samples, effective_order); + levinson_durbin(&autocorr, effective_order).unwrap_or_else(|| LpcCoefficients { + coeffs: vec![0.0; effective_order], + prediction_error: 1.0, + }) +} + +// ── Analysis filter ─────────────────────────────────────────────────────────── + +/// Applies the LPC analysis (all-zero) filter to produce the prediction residual. +/// +/// Computes: `e[n] = x[n] + Σ_{k=0}^{order−1} coeffs[k] · x[n−1−k]` +/// +/// Uses **zero initial state** (all past inputs are treated as 0 before the +/// frame starts). The decoder's [`lpc_synthesis`] uses the same convention, +/// which guarantees a perfect round-trip within a single frame. +/// +/// # Arguments +/// - `samples` – Input signal frame. +/// - `coeffs` – LPC coefficients from [`lpc_analysis`]. +/// +/// # Returns +/// A `Vec` of the same length as `samples` containing the residual. +#[must_use] +pub fn lpc_residual(samples: &[f32], coeffs: &LpcCoefficients) -> Vec { + let order = coeffs.coeffs.len(); + let mut residual = Vec::with_capacity(samples.len()); + for n in 0..samples.len() { + let mut sum = 0.0f32; + for k in 0..order.min(n) { + sum += coeffs.coeffs[k] * samples[n - 1 - k]; + } + residual.push(samples[n] + sum); + } + residual +} + +// ── Synthesis filter ────────────────────────────────────────────────────────── + +/// Applies the LPC synthesis (all-pole) filter to reconstruct the signal. +/// +/// Computes: `y[n] = e[n] − Σ_{k=0}^{order−1} coeffs[k] · y[n−1−k]` +/// +/// This is the exact inverse of [`lpc_residual`] when both use zero initial +/// state. The only reconstruction error comes from quantizing the residual +/// in the SILK encode step. +/// +/// # Arguments +/// - `residual` – Prediction residual from [`lpc_residual`] (or its quantised +/// approximation from the SILK encoder). +/// - `coeffs` – LPC coefficients matching those used during analysis. +/// +/// # Returns +/// A `Vec` of the same length as `residual`. +#[must_use] +pub fn lpc_synthesis(residual: &[f32], coeffs: &LpcCoefficients) -> Vec { + let order = coeffs.coeffs.len(); + let mut output = Vec::with_capacity(residual.len()); + for n in 0..residual.len() { + let mut sum = 0.0f32; + for k in 0..order.min(n) { + sum += coeffs.coeffs[k] * output[n - 1 - k]; + } + output.push(residual[n] - sum); + } + output +} + +// ── Unit tests ──────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + /// Verifies that the analysis/synthesis pair is a perfect inverse (no + /// quantization) when both use zero initial state. + #[test] + fn round_trip_no_quantization() { + let samples: Vec = (0..64) + .map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 44100.0).sin() * 0.5) + .collect(); + + let coeffs = lpc_analysis(&samples, SILK_LPC_ORDER); + let residual = lpc_residual(&samples, &coeffs); + let recovered = lpc_synthesis(&residual, &coeffs); + + for (orig, rec) in samples.iter().zip(recovered.iter()) { + assert!( + (orig - rec).abs() < 1e-4, + "round-trip error {:.2e} > 1e-4", + (orig - rec).abs() + ); + } + } + + /// Confirms Levinson–Durbin returns `None` for a zero-energy frame. + #[test] + fn levinson_silent_frame() { + let autocorr = vec![0.0f64; SILK_LPC_ORDER + 1]; + assert!(levinson_durbin(&autocorr, SILK_LPC_ORDER).is_none()); + } +} diff --git a/src/codecs/opus/mod.rs b/src/codecs/opus/mod.rs new file mode 100644 index 0000000..d682f2c --- /dev/null +++ b/src/codecs/opus/mod.rs @@ -0,0 +1,64 @@ +//! Opus codec skeleton: SILK (speech) and CELT (music) modes. +//! +//! ## Overview +//! +//! This module provides a structural sketch of the Opus codec (RFC 6716) within +//! the `audio_samples` perceptual codec framework. It is gated on the +//! `opus-codec` feature flag. +//! +//! The implementation splits into four sub-modules that mirror the real Opus +//! architecture: +//! +//! | Module | Purpose | +//! |--------|---------| +//! [`lpc`] | Levinson–Durbin LPC analysis/synthesis primitives used by SILK. | +//! [`mode`] | `OpusMode`, `OpusBandwidth`, `OpusConfig`, and `detect_mode`. | +//! [`silk`] | SILK frame encode/decode: LPC residual + 16-bit quantisation. | +//! [`celt`] | CELT frame encode/decode: delegates to the psychoacoustic pipeline. | +//! [`stereo`] | M/S stereo codec [`OpusStereoCodec`]. | +//! [`codec`] | [`OpusCodec`] and [`OpusEncodedAudio`] — the public entry points. | +//! +//! ## Design +//! +//! - CELT mode **reuses** the existing [`crate::codecs::perceptual`] pipeline: +//! MDCT analysis → psychoacoustic masking → bit allocation → scalar +//! quantisation. The CELT-specific band layout ([`crate::BandLayout::celt`]) +//! uses the RFC 6716 band edges. +//! +//! - SILK mode provides **new** LPC primitives in [`lpc`] and combines them in +//! [`silk`] for frame-level encode/decode. +//! +//! - [`OpusCodec`] segments audio into fixed-length frames and dispatches each +//! to SILK or CELT based on the mode from [`OpusConfig`] or automatic +//! detection via [`detect_mode`]. +//! +//! ## What lives in the IO crate +//! +//! Bitstream packing (range coding, Ogg/RIFF containers) and `.opus` file I/O +//! are responsibilities of `audio_samples_io`. This module provides only the +//! algorithmic core: analysis, quantisation, and reconstruction. +//! +//! ## Known sketch limitations +//! +//! - **Hybrid mode** is defined but falls back to CELT. +//! - **Cross-frame LPC state** is not tracked; each SILK frame restarts from +//! zero initial state (introduces mild boundary artefacts for long signals). +//! - **No adaptive codebook** (pitch analysis/long-term prediction) for SILK. +//! - **No range/arithmetic coding** — residuals are stored as raw `Vec`. + +pub mod celt; +pub mod codec; +pub mod lpc; +pub mod mode; +pub mod silk; +pub mod stereo; + +pub use celt::{CeltEncodedFrame, celt_decode_frame, celt_encode_frame}; +pub use codec::{OpusCodec, OpusEncodedAudio, OpusEncodedFrame, OpusFrameData}; +pub use lpc::{ + LpcCoefficients, SILK_LPC_ORDER, compute_autocorrelation, levinson_durbin, lpc_analysis, + lpc_residual, lpc_synthesis, +}; +pub use mode::{OpusBandwidth, OpusConfig, OpusMode, detect_mode}; +pub use silk::{SilkEncodedFrame, silk_decode_frame, silk_encode_frame}; +pub use stereo::{OpusStereoCodec, OpusStereoEncodedAudio}; diff --git a/src/codecs/opus/mode.rs b/src/codecs/opus/mode.rs new file mode 100644 index 0000000..639316b --- /dev/null +++ b/src/codecs/opus/mode.rs @@ -0,0 +1,250 @@ +//! Opus operating mode, audio bandwidth, and codec configuration types. +//! +//! ## Modes +//! +//! Opus defines three operating modes (RFC 6716 §2): +//! +//! | Mode | Description | +//! |--------|-------------| +//! | `SILK` | LP codec for narrowband/wideband speech (Skype SILK). | +//! | `CELT` | MDCT codec for fullband music and generic audio (CELT/Xiph). | +//! | `Hybrid` | SILK for 0–8 kHz, CELT extension for 8–20 kHz. | +//! +//! The mode may be forced via [`OpusConfig::mode`] or set to `None` to trigger +//! automatic per-frame detection via [`detect_mode`]. +//! +//! ## Sketch note +//! +//! Hybrid mode is represented in the type system but falls back to CELT in the +//! current [`crate::codecs::opus::OpusCodec`] implementation. Full hybrid +//! encoding (split-band LP + MDCT) is a planned extension. + +// ── OpusMode ────────────────────────────────────────────────────────────────── + +/// Guard added to each sub-band energy before taking `ln()`, preventing `ln(0)` +/// for silent sub-bands. The value is negligible compared to any audible signal. +const LOG_GUARD_EPSILON: f64 = 1e-10; + +/// Operating mode of the Opus codec for a given audio frame. +/// +/// See the [module documentation](self) for a comparison of modes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OpusMode { + /// SILK speech codec: linear prediction coding. + /// + /// Best suited for narrowband and wideband speech signals. Uses an + /// order-16 LPC predictor and 16-bit residual quantisation. + Silk, + + /// CELT wideband audio codec: MDCT with psychoacoustic bit allocation. + /// + /// Best suited for fullband music, mixed content, and signals that resist + /// LP prediction (e.g. white noise). Reuses the existing + /// [`crate::codecs::perceptual`] pipeline. + Celt, + + /// Hybrid mode: SILK for the baseband (0–8 kHz), CELT extension (8–20 kHz). + /// + /// Used in super-wideband and fullband speech encoding. + /// + /// > **Sketch note**: hybrid encoding is not yet fully implemented. + /// > Frames detected as `Hybrid` are currently encoded with CELT. + Hybrid, +} + +// ── OpusBandwidth ───────────────────────────────────────────────────────────── + +/// Audio bandwidth of an Opus stream. +/// +/// Limits the highest reproduced frequency and constrains which modes are +/// applicable (RFC 6716 §2.1.2). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OpusBandwidth { + /// Narrowband: 0–4 kHz. Applicable mode: SILK only. + NarrowBand, + /// Mediumband: 0–6 kHz. Applicable mode: SILK only. + MediumBand, + /// Wideband: 0–8 kHz. Applicable modes: SILK or CELT. + WideBand, + /// Super-wideband: 0–12 kHz. Applicable modes: Hybrid or CELT. + SuperWideBand, + /// Fullband: 0–20 kHz. Applicable modes: Hybrid or CELT. + FullBand, +} + +impl OpusBandwidth { + /// Returns the upper frequency limit in Hz for this bandwidth. + #[inline] + #[must_use] + pub const fn max_frequency_hz(self) -> f32 { + match self { + Self::NarrowBand => 4_000.0, + Self::MediumBand => 6_000.0, + Self::WideBand => 8_000.0, + Self::SuperWideBand => 12_000.0, + Self::FullBand => 20_000.0, + } + } + + /// Returns `true` if SILK is a valid mode for this bandwidth. + /// + /// SILK is only applicable up to and including [`OpusBandwidth::WideBand`]. + #[inline] + #[must_use] + pub const fn supports_silk(self) -> bool { + matches!(self, Self::NarrowBand | Self::MediumBand | Self::WideBand) + } + + /// Returns `true` if CELT is a valid mode for this bandwidth. + /// + /// CELT requires at least [`OpusBandwidth::WideBand`] to be useful. + #[inline] + #[must_use] + pub const fn supports_celt(self) -> bool { + !matches!(self, Self::NarrowBand | Self::MediumBand) + } +} + +// ── OpusConfig ──────────────────────────────────────────────────────────────── + +/// Configuration parameters for the Opus sketch codec. +/// +/// Passed to [`crate::codecs::opus::OpusCodec::new`] to control the operating +/// mode, bandwidth, bit budget, and frame length. +#[derive(Debug, Clone, PartialEq)] +pub struct OpusConfig { + /// Forced operating mode for all frames. + /// + /// When `None`, the mode is detected automatically per frame via + /// [`detect_mode`]. + pub mode: Option, + + /// Audio bandwidth constraint. + /// + /// Determines the maximum reproduced frequency and restricts applicable + /// modes. Use [`OpusBandwidth::FullBand`] for generic audio. + pub bandwidth: OpusBandwidth, + + /// Total bit budget per CELT frame, or the overall coding budget for SILK + /// residual quantisation. + pub bit_budget: u32, + + /// Duration of one audio frame in milliseconds. + /// + /// Valid Opus values are 2.5, 5, 10, 20, 40, and 60 ms. Other positive + /// values are accepted but may produce sub-optimal performance. Default: 20 ms. + pub frame_size_ms: f32, + + /// Minimum bits guaranteed to every CELT band. + /// + /// Passed directly to [`crate::codecs::perceptual::quantization::allocate_bits`]. + /// Typical value: `1`. + pub min_bits_per_band: u8, +} + +impl OpusConfig { + /// Creates an `OpusConfig` with automatic mode detection and fullband audio. + /// + /// Uses [`OpusBandwidth::FullBand`], a 20 ms frame, and `min_bits_per_band = 1`. + /// + /// # Arguments + /// - `bit_budget` – Bit budget per frame. + #[inline] + #[must_use] + pub fn new(bit_budget: u32) -> Self { + Self { + mode: None, + bandwidth: OpusBandwidth::FullBand, + bit_budget, + frame_size_ms: 20.0, + min_bits_per_band: 1, + } + } + + /// Creates an `OpusConfig` with a fixed operating mode. + /// + /// # Arguments + /// - `mode` – Operating mode forced for every frame. + /// - `bit_budget` – Bit budget per frame. + #[inline] + #[must_use] + pub fn with_mode(mode: OpusMode, bit_budget: u32) -> Self { + Self { + mode: Some(mode), + bandwidth: OpusBandwidth::FullBand, + bit_budget, + frame_size_ms: 20.0, + min_bits_per_band: 1, + } + } +} + +// ── Mode detection ──────────────────────────────────────────────────────────── + +/// Detects the appropriate Opus mode for an audio frame using the Spectral +/// Flatness Measure (SFM). +/// +/// ## Algorithm +/// +/// 1. Check bandwidth constraint: NarrowBand/MediumBand always → SILK; any +/// bandwidth that does not support SILK always → CELT. +/// 2. Compute the SFM of the frame's sub-band energy distribution: +/// `SFM = geometric_mean(band_energies) / arithmetic_mean(band_energies)`. +/// SFM ≈ 1 for flat/noise-like content; SFM ≈ 0 for tonal/speech content. +/// 3. Apply thresholds: +/// - SFM > 0.8 → [`OpusMode::Celt`] +/// - SFM < 0.4 → [`OpusMode::Silk`] +/// - Otherwise → [`OpusMode::Hybrid`] +/// +/// # Arguments +/// - `samples` – One audio frame (typically 20 ms of PCM, f32). +/// - `_sample_rate` – Sample rate in Hz (reserved for future use). +/// - `bandwidth` – Bandwidth constraint from [`OpusConfig`]. +#[must_use] +pub fn detect_mode(samples: &[f32], _sample_rate: u32, bandwidth: OpusBandwidth) -> OpusMode { + // Bandwidth hard-constraints. + if !bandwidth.supports_celt() { + return OpusMode::Silk; + } + if !bandwidth.supports_silk() { + return OpusMode::Celt; + } + + let n = samples.len(); + if n < 4 { + return OpusMode::Celt; + } + + // Split frame into 8 sub-bands and compute SFM over band energies. + const N_SUB_BANDS: usize = 8; + let band_size = n / N_SUB_BANDS; + if band_size == 0 { + return OpusMode::Celt; + } + + let band_energies: Vec = (0..N_SUB_BANDS) + .map(|b| { + let start = b * band_size; + let end = ((b + 1) * band_size).min(n); + samples[start..end] + .iter() + .map(|&x| (x as f64).powi(2)) + .sum::() + + LOG_GUARD_EPSILON // guard against log(0) + }) + .collect(); + + let arith_mean = band_energies.iter().sum::() / N_SUB_BANDS as f64; + let log_sum: f64 = band_energies.iter().map(|&e| e.ln()).sum::(); + let geom_mean = (log_sum / N_SUB_BANDS as f64).exp(); + + let sfm = (geom_mean / arith_mean) as f32; + + if sfm > 0.8 { + OpusMode::Celt + } else if sfm < 0.4 { + OpusMode::Silk + } else { + OpusMode::Hybrid + } +} diff --git a/src/codecs/opus/silk.rs b/src/codecs/opus/silk.rs new file mode 100644 index 0000000..5f7dd49 --- /dev/null +++ b/src/codecs/opus/silk.rs @@ -0,0 +1,172 @@ +//! SILK speech codec: LPC-based frame encode/decode for Opus's speech mode. +//! +//! ## What +//! +//! Implements one encode/decode cycle for a single Opus SILK audio frame using +//! the primitives from [`crate::codecs::opus::lpc`]: +//! +//! 1. **Encode** — LPC analysis → prediction residual → gain normalisation → +//! 16-bit residual quantisation. +//! 2. **Decode** — 16-bit dequantisation → scale by gain → LPC synthesis. +//! +//! ## Why +//! +//! SILK exploits the predictability of voiced speech: the LPC predictor removes +//! the spectral envelope, leaving a spectrally flat residual that is far smaller +//! in energy than the original. Quantising the residual at high resolution +//! (16 bits) achieves very high SNR for speech at low bitrates. +//! +//! For generic audio (music, noise) the LPC provides little prediction gain and +//! [`crate::codecs::opus::celt`] should be preferred via the auto-detection in +//! [`crate::codecs::opus::mode::detect_mode`]. +//! +//! ## Sketch limitations +//! +//! - Each frame is encoded independently with **zero initial LPC state**. +//! A complete implementation would carry the LPC filter state across frames to +//! prevent boundary artifacts. +//! - LPC coefficients are stored as `f32` values. Real SILK transmits Line +//! Spectral Frequency (LSF) parameters quantised to a codebook; the IO layer +//! (`audio_samples_io`) is responsible for that packing. +//! - No pitch analysis or adaptive codebook is implemented. + +use crate::{AudioSampleError, AudioSampleResult, ParameterError}; + +use super::lpc::{LpcCoefficients, SILK_LPC_ORDER, lpc_analysis, lpc_residual, lpc_synthesis}; + +// ── Constants ───────────────────────────────────────────────────────────────── + +/// Scale factor for 16-bit residual quantisation. +/// +/// The normalised residual (in `[−1, 1]`) is multiplied by this constant before +/// rounding to `i16`, and divided by the same constant during dequantisation. +const SILK_RESIDUAL_SCALE: f32 = 32_767.0; + +/// Minimum allowed gain value, used to prevent division by zero for silent frames. +/// +/// Corresponds to a residual peak amplitude of 10^-8 linear, which is well below +/// the noise floor of any practical audio system. +const MIN_GAIN_THRESHOLD: f32 = 1e-8; + +// ── SilkEncodedFrame ────────────────────────────────────────────────────────── + +/// One SILK-encoded audio frame. +/// +/// Stores the LPC coefficients, a 16-bit quantised prediction residual, and the +/// gain used to normalise the residual before quantisation. This struct is +/// self-contained: the decoder needs no external side-channel information. +/// +/// ## Round-trip quality +/// +/// The only error in the encode/decode round-trip is residual quantisation. +/// With `gain = max(|e[n]|)` the maximum per-sample error is `gain / 32767`. +/// For a 440 Hz sine at amplitude 0.5: +/// +/// - The LPC residual energy is close to floating-point noise (≈ 10⁻⁵). +/// - Gain ≈ 10⁻⁵, so maximum error ≈ 3 × 10⁻¹⁰. +/// - Expected SNR > 50 dB. +/// +/// For white noise the LPC provides no prediction gain (residual ≈ input), +/// but 16-bit quantisation still gives ≈ 90 dB dynamic range. +#[derive(Debug, Clone)] +pub struct SilkEncodedFrame { + /// LPC predictor coefficients and prediction error from Levinson–Durbin. + pub lpc_coeffs: LpcCoefficients, + /// Residual `e[n]` normalised to `[−1, 1]` and quantised to 16-bit integers. + pub residual_quantized: Vec, + /// Peak absolute value of the prediction residual before normalisation. + /// + /// The decoder multiplies the dequantised residual by this value to restore + /// the original amplitude scale. + pub gain: f32, + /// Number of PCM samples in the original frame. + pub n_samples: usize, +} + +// ── silk_encode_frame ───────────────────────────────────────────────────────── + +/// Encodes a single SILK audio frame. +/// +/// Steps: +/// 1. Compute an LPC predictor of order [`SILK_LPC_ORDER`] (or less for short frames). +/// 2. Apply the analysis filter to obtain the prediction residual. +/// 3. Compute `gain = max(|e[n]|)` and normalise: `e_norm[n] = e[n] / gain`. +/// 4. Quantise the normalised residual to 16-bit integers: +/// `q[n] = round(e_norm[n] × 32767)`. +/// +/// # Arguments +/// - `samples` – PCM samples for the frame (f32, any amplitude range). +/// +/// # Errors +/// Returns [`AudioSampleError::Parameter`] if `samples` is empty. +pub fn silk_encode_frame(samples: &[f32]) -> AudioSampleResult { + if samples.is_empty() { + return Err(AudioSampleError::Parameter(ParameterError::invalid_value( + "samples", + "SILK frame must contain at least one sample", + ))); + } + + let n_samples = samples.len(); + + // LPC analysis — order clamped for very short frames. + let lpc_coeffs = lpc_analysis(samples, SILK_LPC_ORDER); + + // Analysis filter → prediction residual. + let residual = lpc_residual(samples, &lpc_coeffs); + + // Gain = peak absolute residual (ensures normalised residual fits in [−1, 1]). + let gain = residual + .iter() + .copied() + .map(f32::abs) + .fold(0.0_f32, f32::max) + .max(MIN_GAIN_THRESHOLD); // prevent division by zero for silent frames + + // Quantise to i16. + let residual_quantized: Vec = residual + .iter() + .map(|&r| { + let scaled = r / gain * SILK_RESIDUAL_SCALE; + scaled + .round() + .clamp(-SILK_RESIDUAL_SCALE, SILK_RESIDUAL_SCALE) as i16 + }) + .collect(); + + Ok(SilkEncodedFrame { + lpc_coeffs, + residual_quantized, + gain, + n_samples, + }) +} + +// ── silk_decode_frame ───────────────────────────────────────────────────────── + +/// Decodes a SILK-encoded audio frame. +/// +/// Steps: +/// 1. Dequantise: `e_hat[n] = q[n] / 32767 × gain`. +/// 2. Apply the LPC synthesis filter: `y[n] = e_hat[n] − Σ a[k]·y[n−1−k]`. +/// +/// Both encoder and decoder use zero initial state, so the round-trip is exact +/// up to quantisation error (see [`SilkEncodedFrame`] for quality details). +/// +/// # Arguments +/// - `frame` – A SILK frame produced by [`silk_encode_frame`]. +/// +/// # Returns +/// A `Vec` of `frame.n_samples` reconstructed PCM samples. +#[must_use] +pub fn silk_decode_frame(frame: &SilkEncodedFrame) -> Vec { + // Dequantise residual. + let residual: Vec = frame + .residual_quantized + .iter() + .map(|&q| q as f32 / SILK_RESIDUAL_SCALE * frame.gain) + .collect(); + + // LPC synthesis filter. + lpc_synthesis(&residual, &frame.lpc_coeffs) +} diff --git a/src/codecs/opus/stereo.rs b/src/codecs/opus/stereo.rs new file mode 100644 index 0000000..dc51d9b --- /dev/null +++ b/src/codecs/opus/stereo.rs @@ -0,0 +1,230 @@ +//! Mid/Side stereo encoding for Opus: [`OpusStereoCodec`] and helpers. +//! +//! ## What +//! +//! Extends the mono [`OpusCodec`] to two-channel audio using the same +//! Mid/Side (M/S) matrix coding that [`crate::codecs::perceptual::stereo`] +//! applies to the perceptual codec: +//! +//! - **Mid** = `(L + R) / 2` — the common (mono-compatible) content. +//! - **Side** = `(L − R) / 2` — the stereo difference signal. +//! +//! Each channel is then compressed independently with its own [`OpusCodec`], +//! allowing separate bit budgets. The side channel is typically less energetic +//! and can receive fewer bits with minimal perceptual harm. +//! +//! ## Why +//! +//! Stereo audio is often highly correlated. M/S decorrelates the two channels, +//! concentrating the signal energy in the mid channel and leaving a low-energy +//! side channel. The side channel can then be coded at a lower bitrate. +//! +//! ## How +//! +//! ```rust,ignore +//! use audio_samples::codecs::{encode, decode}; +//! use audio_samples::codecs::opus::{OpusStereoCodec, OpusConfig, OpusMode}; +//! use spectrograms::WindowType; +//! +//! let mid_config = OpusConfig::with_mode(OpusMode::Celt, 96_000); +//! let side_config = OpusConfig::with_mode(OpusMode::Celt, 32_000); +//! let codec = OpusStereoCodec::new(mid_config, side_config, WindowType::Hanning); +//! +//! let encoded = encode(&stereo_audio, codec)?; +//! let recovered = decode::(encoded)?; +//! ``` + +use std::num::NonZeroU32; + +use non_empty_slice::NonEmptyVec; +use spectrograms::WindowType; + +use crate::codecs::perceptual::codec::AudioCodec; +use crate::codecs::perceptual::stereo::{mid_side_decode, mid_side_encode}; +use crate::traits::AudioTypeConversion; +use crate::{AudioSampleError, AudioSampleResult, AudioSamples, ParameterError, StandardSample}; + +use super::codec::{OpusCodec, OpusEncodedAudio}; +use super::mode::OpusConfig; + +// ── OpusStereoEncodedAudio ──────────────────────────────────────────────────── + +/// In-memory encoded audio produced by [`OpusStereoCodec`]. +/// +/// Holds independently encoded mid and side channels plus the original signal +/// metadata needed for reconstruction. +#[derive(Debug, Clone)] +pub struct OpusStereoEncodedAudio { + /// Encoded mid (sum) channel. + pub mid: OpusEncodedAudio, + /// Encoded side (difference) channel. + pub side: OpusEncodedAudio, + /// Total original signal length in samples per channel. + pub original_length: usize, + /// Sample rate of the original signal. + pub sample_rate: NonZeroU32, +} + +// ── OpusStereoCodec ─────────────────────────────────────────────────────────── + +/// A two-channel Opus codec using Mid/Side matrix coding. +/// +/// ## What +/// +/// Takes a stereo (`2 × N`) signal, applies the M/S transform to decorrelate +/// the channels, and codes mid and side independently with separate +/// [`OpusCodec`] instances. +/// +/// ## Invariants +/// +/// - Input audio must have exactly two channels. +/// - `mid_config` and `side_config` are applied to the respective mono signals +/// produced by the M/S transform. +#[derive(Debug, Clone)] +pub struct OpusStereoCodec { + /// Codec configuration for the mid (sum) channel. + pub mid_config: OpusConfig, + /// Codec configuration for the side (difference) channel. + pub side_config: OpusConfig, + /// MDCT window function shared by both channel codecs. + pub window: WindowType, +} + +impl OpusStereoCodec { + /// Creates an `OpusStereoCodec`. + /// + /// # Arguments + /// - `mid_config` – Configuration for the mid channel (common content). + /// Typically higher bitrate. + /// - `side_config` – Configuration for the side channel (difference signal). + /// Typically lower bitrate since the side is usually less energetic. + /// - `window` – MDCT window function for CELT frames on both channels. + #[inline] + #[must_use] + pub fn new(mid_config: OpusConfig, side_config: OpusConfig, window: WindowType) -> Self { + Self { + mid_config, + side_config, + window, + } + } + + /// Builds the mid-channel [`OpusCodec`]. + fn mid_codec(&self) -> OpusCodec { + OpusCodec::new(self.mid_config.clone(), self.window.clone()) + } + + /// Builds the side-channel [`OpusCodec`]. + fn side_codec(&self) -> OpusCodec { + OpusCodec::new(self.side_config.clone(), self.window.clone()) + } +} + +// ── AudioCodec impl ─────────────────────────────────────────────────────────── + +impl AudioCodec for OpusStereoCodec { + type Encoded = OpusStereoEncodedAudio; + + fn encode( + self, + audio: &AudioSamples, + ) -> AudioSampleResult { + if audio.num_channels().get() != 2 { + return Err(AudioSampleError::Parameter(ParameterError::invalid_value( + "audio", + format!( + "OpusStereoCodec requires exactly 2 channels, got {}", + audio.num_channels() + ), + ))); + } + + let sample_rate = audio.sample_rate(); + let original_length = audio.samples_per_channel().get(); + + // Extract L/R as f32. + let audio_f32 = audio.to_format::(); + let mut channels = audio_f32.channels(); + let left = channels + .next() + .expect("validated 2 channels") + .as_slice() + .expect("contiguous") + .to_vec(); + let right = channels + .next() + .expect("validated 2 channels") + .as_slice() + .expect("contiguous") + .to_vec(); + + // M/S matrix encode. + let (mid_samples, side_samples) = mid_side_encode(&left, &right); + + let mid_ne = NonEmptyVec::new(mid_samples).map_err(|_| AudioSampleError::EmptyData)?; + let side_ne = NonEmptyVec::new(side_samples).map_err(|_| AudioSampleError::EmptyData)?; + + let mid_audio: AudioSamples<'static, f32> = + AudioSamples::from_mono_vec(mid_ne, sample_rate); + let side_audio: AudioSamples<'static, f32> = + AudioSamples::from_mono_vec(side_ne, sample_rate); + + // Encode each channel independently. + let mid = self.mid_codec().encode(&mid_audio)?; + let side = self.side_codec().encode(&side_audio)?; + + Ok(OpusStereoEncodedAudio { + mid, + side, + original_length, + sample_rate, + }) + } + + fn decode( + encoded: Self::Encoded, + ) -> AudioSampleResult> + where + f32: crate::ConvertFrom, + { + let sample_rate = encoded.sample_rate; + let target_length = encoded.original_length; + + // Decode mid and side as mono f32. + let mid_f32 = OpusCodec::decode::(encoded.mid)?; + let side_f32 = OpusCodec::decode::(encoded.side)?; + + let mid_ch = mid_f32 + .channels() + .next() + .expect("OpusCodec::decode returns mono"); + let side_ch = side_f32 + .channels() + .next() + .expect("OpusCodec::decode returns mono"); + + let mid_samples = mid_ch.as_slice().expect("mono contiguous"); + let side_samples = side_ch.as_slice().expect("mono contiguous"); + + // M/S matrix decode → L/R. + let (left, right) = mid_side_decode(mid_samples, side_samples); + + // Interleave L/R → stereo AudioSamples, truncating to original length. + let interleaved: Vec = left + .iter() + .zip(right.iter()) + .take(target_length) + .flat_map(|(&l, &r)| [l, r]) + .collect(); + let interleaved_ne = + NonEmptyVec::new(interleaved).map_err(|_| AudioSampleError::EmptyData)?; + + let stereo_f32: AudioSamples<'static, f32> = AudioSamples::from_interleaved_vec( + interleaved_ne, + NonZeroU32::new(2).expect("2 > 0"), + sample_rate, + )?; + + Ok(stereo_f32.to_format::()) + } +} diff --git a/src/codecs/perceptual/bands.rs b/src/codecs/perceptual/bands.rs index 42afc19..ad5afc2 100644 --- a/src/codecs/perceptual/bands.rs +++ b/src/codecs/perceptual/bands.rs @@ -342,6 +342,88 @@ impl BandLayout { Self::new(bands_non_empty) } + /// Creates a CELT-style band layout using the Opus RFC 6716 band edges. + /// + /// Maps the 21 perceptual bands defined in RFC 6716 §4.3.1 onto the caller's + /// spectral-bin array. Bands whose lower edge falls at or above Nyquist are + /// omitted, so the returned layout will have **fewer than 21 bands** for + /// sample rates below 40 kHz. + /// + /// This band layout is the recommended choice for the CELT mode of + /// [`crate::codecs::opus::OpusCodec`]. + /// + /// `n_bins` is the number of spectral bins (typically `window_size / 2`). + /// + /// # Arguments + /// - `sample_rate_hz` – Sample rate of the audio signal in Hz. + /// - `n_bins` – Total number of spectral bins. + /// + /// # Returns + /// A `BandLayout` with up to 21 entries. + /// + /// # Examples + /// + /// ```rust + /// # #[cfg(feature = "psychoacoustic")] { + /// use audio_samples::BandLayout; + /// use std::num::NonZeroUsize; + /// + /// // 20 ms frame at 44.1 kHz → 882 samples → 441 bins + /// let layout = BandLayout::celt(44100.0, NonZeroUsize::new(441).unwrap()); + /// assert!(layout.len().get() <= 21); + /// for band in layout.as_slice().iter() { + /// assert!(band.end_bin > band.start_bin); + /// } + /// # } + /// ``` + #[must_use] + pub fn celt(sample_rate_hz: f32, n_bins: NonZeroUsize) -> Self { + /// CELT band boundary frequencies in Hz (RFC 6716 §4.3.1, Table 2). + /// + /// 22 edge values define 21 bands: + /// `[0–200), [200–400), …, [15600–20000)`. + const CELT_BAND_EDGES_HZ: &[f32] = &[ + 0.0, 200.0, 400.0, 600.0, 800.0, 1000.0, 1200.0, 1400.0, 1600.0, + 2000.0, 2400.0, 2800.0, 3200.0, 4000.0, 4800.0, 5600.0, 6800.0, + 8000.0, 9600.0, 12000.0, 15600.0, 20000.0, + ]; + + let nyquist = sample_rate_hz / 2.0; + let bins = n_bins.get(); + let n_edges = CELT_BAND_EDGES_HZ.len(); + + let mut bands: Vec = Vec::with_capacity(n_edges - 1); + + for i in 0..n_edges - 1 { + let low_hz = CELT_BAND_EDGES_HZ[i]; + if low_hz >= nyquist { + break; + } + let high_hz = CELT_BAND_EDGES_HZ[i + 1].min(nyquist); + let centre_hz = (low_hz + high_hz) / 2.0; + + let start_bin = hz_to_bin(low_hz, sample_rate_hz, bins); + let end_bin_raw = hz_to_bin(high_hz, sample_rate_hz, bins); + let end_bin = end_bin_raw.max(start_bin + 1).min(bins); + + // Normalised position: linear index in [0, 1]. + let perceptual_position = i as f32 / (n_edges - 2) as f32; + + // SAFETY: end_bin > start_bin is enforced by .max(start_bin + 1) above. + bands.push(unsafe { Band::new(start_bin, end_bin, centre_hz, perceptual_position) }); + } + + // Guarantee at least one band even for very low sample rates. + if bands.is_empty() { + // SAFETY: end_bin = 1 > 0 = start_bin. + bands.push(unsafe { Band::new(0, 1, nyquist / 2.0, 0.0) }); + } + + // SAFETY: bands is non-empty (guarded by the fallback above). + let bands_ne = unsafe { NonEmptySlice::new_unchecked(&bands) }; + Self::new(bands_ne) + } + /// Creates an ERB-scale band layout. /// /// Divides the spectrum from 0 Hz to Nyquist into `n_bands` bands spaced diff --git a/src/lib.rs b/src/lib.rs index cb08818..9333ff5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -478,6 +478,17 @@ pub use crate::codecs::perceptual::quantization::{ #[cfg(feature = "psychoacoustic")] pub use crate::codecs::perceptual::codec::{decode as codec_decode, encode as codec_encode}; +// ── Opus codec re-exports ───────────────────────────────────────────────────── + +#[cfg(feature = "opus-codec")] +pub use crate::codecs::opus::{ + CeltEncodedFrame, OpusBandwidth, OpusCodec, OpusConfig, OpusEncodedAudio, OpusEncodedFrame, + OpusFrameData, OpusMode, OpusStereoCodec, OpusStereoEncodedAudio, SilkEncodedFrame, + celt_decode_frame, celt_encode_frame, detect_mode as detect_opus_mode, lpc_analysis, + lpc_residual, lpc_synthesis, levinson_durbin, silk_decode_frame, silk_encode_frame, + LpcCoefficients, SILK_LPC_ORDER, +}; + #[cfg(feature = "random-generation")] pub use crate::utils::generation::{brown_noise, pink_noise, white_noise}; diff --git a/tests/opus_codec_quality.rs b/tests/opus_codec_quality.rs new file mode 100644 index 0000000..6f1c3fb --- /dev/null +++ b/tests/opus_codec_quality.rs @@ -0,0 +1,274 @@ +/// Round-trip quality tests for the Opus sketch codec. +/// +/// Each test encodes a signal with a fixed mode (SILK or CELT), decodes it, +/// and measures the SNR between original and reconstructed audio. +/// +/// ## Quality notes +/// +/// - **SILK** achieves high SNR (>30 dB) for both tonal and noise signals because +/// it uses a time-domain LPC analysis/synthesis that doesn't suffer from +/// spectral reconstruction artefacts. +/// +/// - **CELT** delegates to the existing [`PerceptualCodec`] pipeline (MDCT + +/// psychoacoustic masking). The quality is limited by the same MDCT boundary +/// effect that affects [`PerceptualCodec`]: the Hanning-windowed MDCT produces +/// near-zero samples at the beginning and end of each frame, which caps the +/// achievable SNR at approximately 2–3 dB for typical signals. This matches +/// the behaviour observed in the existing `codec_quality` integration tests. +/// +/// The thresholds below therefore represent the floor that the **sketch** +/// implementation is expected to meet consistently, not the target for a +/// fully-optimised production codec. +/// +/// ## Run +/// +/// ```bash +/// cargo test --test opus_codec_quality \ +/// --no-default-features \ +/// --features "opus-codec,random-generation" +/// ``` +use std::time::Duration; + +use audio_samples::{ + OpusCodec, OpusConfig, OpusMode, + codecs::{decode, encode}, + sample_rate, + utils::generation::{sine_wave, white_noise}, + utils::comparison::snr, +}; +use non_empty_slice::NonEmptyVec; +use spectrograms::WindowType; + +const SIGNAL_DURATION_MS: u64 = 200; + +// ── Codec constructors ──────────────────────────────────────────────────────── + +fn silk_codec(bit_budget: u32) -> OpusCodec { + let config = OpusConfig::with_mode(OpusMode::Silk, bit_budget); + OpusCodec::new(config, WindowType::Hanning) +} + +fn celt_codec(bit_budget: u32) -> OpusCodec { + let config = OpusConfig::with_mode(OpusMode::Celt, bit_budget); + OpusCodec::new(config, WindowType::Hanning) +} + +fn auto_codec(bit_budget: u32) -> OpusCodec { + let config = OpusConfig::new(bit_budget); + OpusCodec::new(config, WindowType::Hanning) +} + +// ── Test signals ────────────────────────────────────────────────────────────── + +fn signal_sine(freq_hz: f64) -> audio_samples::AudioSamples<'static, f32> { + sine_wave::( + freq_hz, + Duration::from_millis(SIGNAL_DURATION_MS), + sample_rate!(44100), + 0.5, + ) +} + +fn signal_white_noise() -> audio_samples::AudioSamples<'static, f32> { + white_noise::( + Duration::from_millis(SIGNAL_DURATION_MS), + sample_rate!(44100), + 0.5, + Some(42), + ) +} + +// ── Round-trip helper ───────────────────────────────────────────────────────── + +/// Encodes → decodes and returns the SNR in dB. +fn round_trip_snr( + signal: &audio_samples::AudioSamples<'static, f32>, + codec: OpusCodec, +) -> f64 { + let encoded = encode(signal, codec).expect("encode failed"); + let reconstructed = decode::(encoded).expect("decode failed"); + + let orig = signal.as_slice().expect("original contiguous"); + let recon = reconstructed.as_slice().expect("reconstructed contiguous"); + let len = orig.len().min(recon.len()); + + let error: Vec = orig[..len] + .iter() + .zip(recon[..len].iter()) + .map(|(a, b)| a - b) + .collect(); + + // SAFETY: `len = orig.len().min(recon.len())`. Both signals are the output + // of a successful encode/decode, which requires a non-empty input, so + // orig.len() >= 1 and recon.len() >= 1, therefore len >= 1 and `error` is + // non-empty. + debug_assert!(!error.is_empty(), "error vector must be non-empty"); + let nev = unsafe { NonEmptyVec::new_unchecked(error) }; + let error_sig = + audio_samples::AudioSamples::<'static, f32>::from_mono_vec(nev, signal.sample_rate()); + + let orig_trimmed = if len == orig.len() { + signal.clone() + } else { + let trimmed: Vec = orig[..len].to_vec(); + // SAFETY: len >= 1 (same guarantee as error above), so trimmed is non-empty. + debug_assert!(!trimmed.is_empty(), "trimmed vector must be non-empty"); + let nev2 = unsafe { NonEmptyVec::new_unchecked(trimmed) }; + audio_samples::AudioSamples::<'static, f32>::from_mono_vec(nev2, signal.sample_rate()) + }; + + snr(&orig_trimmed, &error_sig).expect("snr failed") +} + +// ── SILK mode tests ─────────────────────────────────────────────────────────── + +/// SILK achieves high SNR because it uses a time-domain LPC round-trip +/// that doesn't have the MDCT window boundary limitation. +#[test] +fn silk_sine_440() { + let snr_db = round_trip_snr(&signal_sine(440.0), silk_codec(64_000)); + eprintln!("SILK sine_440 @ 64k: {snr_db:.2} dB"); + assert!(snr_db > 20.0, "SILK SNR {snr_db:.2} dB below floor of 20 dB"); +} + +#[test] +fn silk_sine_1k() { + let snr_db = round_trip_snr(&signal_sine(1000.0), silk_codec(64_000)); + eprintln!("SILK sine_1k @ 64k: {snr_db:.2} dB"); + assert!(snr_db > 20.0, "SILK SNR {snr_db:.2} dB below floor of 20 dB"); +} + +#[test] +fn silk_white_noise() { + let snr_db = round_trip_snr(&signal_white_noise(), silk_codec(64_000)); + eprintln!("SILK white_noise @ 64k: {snr_db:.2} dB"); + assert!(snr_db > 20.0, "SILK SNR {snr_db:.2} dB below floor of 20 dB"); +} + +// ── CELT mode tests ─────────────────────────────────────────────────────────── +// +// CELT quality is bounded by the MDCT window boundary effect. Each 20 ms +// Opus frame is encoded as a single MDCT window, so the first and last ~hop +// samples of every frame are windowed to near-zero. This is the same +// limitation as `PerceptualCodec` (see codec_quality.rs). Thresholds are +// set to 1.5 dB below the minimum actually observed during development to +// provide a stable regression floor. + +#[test] +fn celt_sine_440_64k() { + let snr_db = round_trip_snr(&signal_sine(440.0), celt_codec(64_000)); + eprintln!("CELT sine_440 @ 64k: {snr_db:.2} dB"); + assert!(snr_db > 1.0, "CELT SNR {snr_db:.2} dB below floor of 1 dB"); +} + +#[test] +fn celt_sine_440_128k() { + let snr_db = round_trip_snr(&signal_sine(440.0), celt_codec(128_000)); + eprintln!("CELT sine_440 @ 128k: {snr_db:.2} dB"); + assert!(snr_db > 1.0, "CELT SNR {snr_db:.2} dB below floor of 1 dB"); +} + +#[test] +fn celt_sine_1k_64k() { + let snr_db = round_trip_snr(&signal_sine(1000.0), celt_codec(64_000)); + eprintln!("CELT sine_1k @ 64k: {snr_db:.2} dB"); + assert!(snr_db > 1.0, "CELT SNR {snr_db:.2} dB below floor of 1 dB"); +} + +#[test] +fn celt_white_noise_64k() { + let snr_db = round_trip_snr(&signal_white_noise(), celt_codec(64_000)); + eprintln!("CELT white_noise @ 64k: {snr_db:.2} dB"); + // White noise through MDCT has near-zero SNR due to boundary effects. + assert!(snr_db > -1.0, "CELT SNR {snr_db:.2} dB below floor of -1 dB"); +} + +// ── Auto-mode detection tests ───────────────────────────────────────────────── + +#[test] +fn auto_mode_sine_440() { + let snr_db = round_trip_snr(&signal_sine(440.0), auto_codec(128_000)); + eprintln!("Auto sine_440 @ 128k: {snr_db:.2} dB"); + assert!(snr_db > 1.0, "auto-mode SNR {snr_db:.2} dB below floor of 1 dB"); +} + +#[test] +fn auto_mode_white_noise() { + let snr_db = round_trip_snr(&signal_white_noise(), auto_codec(128_000)); + eprintln!("Auto white_noise @ 128k: {snr_db:.2} dB"); + assert!(snr_db > -1.0, "auto-mode SNR {snr_db:.2} dB below floor of -1 dB"); +} + +// ── BandLayout::celt sanity test ────────────────────────────────────────────── + +#[test] +fn celt_band_layout_sanity() { + use audio_samples::BandLayout; + use std::num::NonZeroUsize; + + // 20 ms at 44.1 kHz = 882 samples → 441 MDCT bins. + let n_bins = NonZeroUsize::new(441).unwrap(); + let layout = BandLayout::celt(44100.0, n_bins); + + // Should have between 1 and 21 bands. + assert!(layout.len().get() >= 1); + assert!(layout.len().get() <= 21); + + // All bands must respect end_bin > start_bin. + for band in layout.as_slice().iter() { + assert!( + band.end_bin > band.start_bin, + "band [{}, {}) is empty", + band.start_bin, + band.end_bin, + ); + assert!(band.end_bin <= n_bins.get(), "band end_bin out of range"); + } +} + +#[test] +fn celt_band_layout_low_sample_rate() { + use audio_samples::BandLayout; + use std::num::NonZeroUsize; + + // 8 kHz narrowband — only bands up to 4 kHz should appear. + let n_bins = NonZeroUsize::new(80).unwrap(); + let layout = BandLayout::celt(8000.0, n_bins); + assert!(layout.len().get() >= 1, "must have at least one band"); + for band in layout.as_slice().iter() { + assert!(band.end_bin > band.start_bin); + assert!(band.end_bin <= n_bins.get()); + } +} + +// ── SILK LPC round-trip unit test ───────────────────────────────────────────── + +/// Confirms that the SILK encode/decode pair is a near-perfect round trip. +/// This isolates the LPC primitives from the full OpusCodec stack. +#[test] +fn silk_lpc_round_trip() { + use audio_samples::{silk_encode_frame, silk_decode_frame}; + + let samples: Vec = (0..128) + .map(|i| (2.0 * std::f32::consts::PI * 440.0 * i as f32 / 44100.0).sin() * 0.5) + .collect(); + + let frame = silk_encode_frame(&samples).expect("encode"); + let recovered = silk_decode_frame(&frame); + + assert_eq!(recovered.len(), samples.len()); + + let signal_power: f32 = samples.iter().map(|&x| x * x).sum::() / samples.len() as f32; + let error_power: f32 = samples + .iter() + .zip(recovered.iter()) + .map(|(&a, &b)| (a - b).powi(2)) + .sum::() + / samples.len() as f32; + let snr_db = 10.0 * (signal_power / error_power.max(1e-15)).log10(); + + assert!( + snr_db > 30.0, + "SILK LPC round-trip SNR {snr_db:.2} dB below floor of 30 dB" + ); +}