mbelib_rs/encode/vuv.rs
1// SPDX-FileCopyrightText: 2009 Pavel Yazev (OP25 imbe_vocoder/v_uv_det.cc)
2// SPDX-FileCopyrightText: 2026 Swift Raccoon
3// SPDX-License-Identifier: GPL-2.0-or-later OR GPL-3.0-or-later
4
5//! Per-band voiced/unvoiced decisions + integrated spectral amplitude
6//! extraction.
7//!
8//! Port of OP25's `imbe_vocoder::v_uv_det`. For each harmonic in the
9//! `±f0/2` analysis window, fit a weighted sinusoid via the spectral
10//! response window [`WR_SP`](crate::encode::wr_sp::WR_SP), then decide
11//! voicing per band from the reconstruction-error-to-energy ratio
12//! `Dk = D_num / D_den`. Bands group 3 adjacent harmonics except the
13//! final band which takes the remainder.
14//!
15//! # Differences from a bare V/UV
16//!
17//! The OP25 algorithm couples V/UV with spectral amplitude
18//! extraction: the same sinusoidal-fit pass that produces the
19//! per-bin fitted amplitude feeds into both the `Dk` ratio (V/UV
20//! output) and the per-harmonic SA output. Re-running extraction in
21//! a separate pass using centre-bin integration (our old
22//! [`extract_spectral_amplitudes`](crate::encode::extract_spectral_amplitudes))
23//! produces different numerical SAs because the 3-bin-power
24//! integration doesn't account for the Hamming spectral lobe the
25//! analysis window imparts on each harmonic.
26//!
27//! # State carried across frames
28//!
29//! [`VuvState`] holds the `v_uv_dsn` hysteresis array (previous
30//! frame's decision per band) and the ``th_max`` sliding frame-quality
31//! maximum. Both are part of OP25's steady-state behaviour:
32//!
33//! - Hysteresis: a band that was voiced last frame has a lower bar
34//! to stay voiced (threshold 0.5625 · `M_fcn` − …). An unvoiced band
35//! has a higher bar to become voiced (threshold 0.45 · `M_fcn` − …).
36//! Prevents per-frame V/UV ping-pong on borderline signals.
37//!
38//! - `th_max`: slow-update ceiling of total frame energy. When the
39//! current frame's total energy `th0 > th_max`, `th_max` jumps to
40//! the midpoint; otherwise it decays at 0.99 per frame. The
41//! V/UV threshold is scaled by `M_fcn = (th0 + 0.0025·th_max) /
42//! (th0 + 0.01·th_max)`, which is near 1 for loud frames and
43//! approaches 0.25 for very quiet ones. Quiet frames are thus
44//! pushed toward unvoiced (the simpler, noisier synthesis mode).
45
46use realfft::num_complex::Complex;
47
48use crate::encode::spectral::{MAX_HARMONICS, SpectralAmplitudes};
49use crate::encode::wr_sp::{WR_SP, WR_SP_CENTER};
50
/// Maximum number of V/UV bands (12 per AMBE spec).
pub const MAX_BANDS: usize = 12;

/// Harmonics per band (OP25 `NUM_HARMS_PER_BAND`). The final band
/// absorbs whatever remainder the harmonic count leaves (see the
/// commit logic in `detect_vuv_and_sa`).
const HARMS_PER_BAND: usize = 3;
56
/// Per-stream V/UV state carried across frames.
///
/// One instance per audio stream; [`detect_vuv_and_sa`] both reads
/// and rewrites it on every frame.
#[derive(Debug, Clone, Copy)]
pub struct VuvState {
    /// Previous-frame V/UV decision per band. Used to bias the
    /// current-frame threshold: a band that was voiced is easier to
    /// keep voiced (hysteresis).
    prev_voiced: [bool; MAX_BANDS],
    /// Slow-update frame-energy ceiling used to compute `M_fcn`, the
    /// per-frame threshold multiplier. Approaches the peak frame
    /// energy seen so far; decays at 0.99 per frame when the current
    /// frame is below the peak.
    th_max: f32,
}
70
71impl VuvState {
72 /// Fresh state — no hysteresis, `th_max` starts at 0.
73 #[must_use]
74 pub const fn new() -> Self {
75 Self {
76 prev_voiced: [false; MAX_BANDS],
77 th_max: 0.0,
78 }
79 }
80}
81
82impl Default for VuvState {
83 fn default() -> Self {
84 Self::new()
85 }
86}
87
/// Per-frame V/UV decision vector.
#[derive(Debug, Clone, Copy)]
pub struct VuvDecisions {
    /// `true` = voiced band (periodic content dominates), `false` =
    /// unvoiced (noise-like). Only the first `num_bands` entries
    /// are meaningful; the remaining slots keep their `false`
    /// padding.
    pub voiced: [bool; MAX_BANDS],
    /// Number of active harmonic bands for this frame (derived from
    /// the pitch).
    pub num_bands: usize,
}
99
100/// Stateless one-shot V/UV convenience wrapper.
101///
102/// Builds a fresh [`VuvState`] internally and calls
103/// [`detect_vuv_and_sa`] with a neutral `e_p = 0.5`. Discards the
104/// spectral amplitudes. Exposed for validators and tests that want a
105/// simple "classify this spectrum" call; the encoder itself uses
106/// [`detect_vuv_and_sa`] directly so that hysteresis + ``th_max`` carry
107/// across frames.
108#[must_use]
109#[allow(
110 clippy::cast_possible_truncation,
111 clippy::cast_sign_loss,
112 clippy::cast_precision_loss,
113 reason = "DSP bin math; inputs bounded by FFT length"
114)]
115pub fn detect_vuv(fft_out: &[Complex<f32>], f0_bin: f32) -> VuvDecisions {
116 let mut state = VuvState::new();
117 let (vuv, _sa) = detect_vuv_and_sa(fft_out, f0_bin, &mut state, 0.5);
118 vuv
119}
120
/// Integrated V/UV + spectral amplitude extraction.
///
/// Ports OP25's `v_uv_det`. For each harmonic `k ∈ 1..=num_harms`,
/// extracts the analysis window `[k·f0 − f0/2, k·f0 + f0/2]` from
/// the FFT, fits a windowed sinusoid, and accumulates error and
/// energy. Every `HARMS_PER_BAND` harmonics, commits a band decision
/// based on `Dk = D_num / D_den < dsn_thr`.
///
/// `e_p` is the current-frame pitch-error metric (output of the
/// pitch tracker); passing a large value (`> 0.55`) disables voicing
/// in all but the first band — the pitch quality is too low to trust
/// the harmonic model.
///
/// Returns the per-band decisions plus per-harmonic spectral
/// amplitudes (voiced harmonics via the sinusoidal-fit energy,
/// unvoiced via RMS bin energy). Also updates `state` in place.
#[must_use]
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::cast_precision_loss,
    clippy::too_many_lines,
    reason = "Mirrors OP25's v_uv_det block so the port can be cross-referenced line-by-line"
)]
pub fn detect_vuv_and_sa(
    fft_out: &[Complex<f32>],
    f0_bin: f32,
    state: &mut VuvState,
    e_p: f32,
) -> (VuvDecisions, SpectralAmplitudes) {
    // Derive num_harms / num_bands from pitch. OP25:
    // num_harms = min(max(int((pitch/2 + 0.5) · 0.9254), NUM_HARMS_MIN), NUM_HARMS_MAX)
    // In our bin-domain, pitch_period_samples = 256 / f0_bin.
    let period = if f0_bin > 0.5 { 256.0 / f0_bin } else { 256.0 };
    // OP25 computes `num_harms` via TWO successive integer
    // truncations (`v_uv_det.cc:117-118`):
    //
    //   tmp       = shr(add(shr(ref_pitch, 1), CNST_0_25_Q8_8), 8)
    //             = int_part(period / 2 + 0.25)
    //   num_harms = extract_h(tmp * CNST_0_9254_Q0_16)
    //             = int_part(tmp * 0.9254)
    //
    // The inner truncation matters: a pure
    // `floor((period/2 + 0.5) * 0.9254)` (our previous formula) is
    // off-by-one on many mid-range pitches — e.g. `period = 54.75`
    // gives `(27.375 + 0.5) * 0.9254 = 25.8 → 25`, while OP25's
    // pathway gives `floor(27.375 + 0.25) * 0.9254 = 27 * 0.9254 =
    // 24.98 → 24`. That 1-harmonic disagreement shifts
    // `L_TABLE`-indexed `b0` selection and cascades into b1/b2/b3
    // quantizer searches. Matched exactly here.
    let num_harms: usize = {
        #[allow(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "period ∈ (0, 1024); period/2 + 0.25 < 513; truncating to usize is exact"
        )]
        let tmp = (period * 0.5 + 0.25) as usize;
        #[allow(clippy::cast_precision_loss, reason = "tmp ≤ 513; exact in f32")]
        let raw = tmp as f32 * 0.9254;
        let clamped = raw.max(9.0).min(MAX_HARMONICS as f32);
        #[allow(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "clamped above to [9.0, MAX_HARMONICS]; truncation is exact"
        )]
        let n = clamped as usize;
        n
    };
    // Ceiling-divide harmonics into 3-per-band groups; 37+ harmonics
    // always fill all MAX_BANDS bands.
    let num_bands = if num_harms <= 36 {
        ((num_harms + 2) / HARMS_PER_BAND).min(MAX_BANDS)
    } else {
        MAX_BANDS
    };

    // ── Frame-level quality metric `M_fcn` ──────────────────────
    //
    // `energy_low` = low-half-spectrum energy, `energy_high` =
    // high-half, `energy_total` = sum. Slow-update `th_max` so
    // `M_fcn` reflects "this frame vs recent loudest". OP25's
    // thresholds match int16 signal scale; we operate on normalized
    // f32 [-1, 1), so the numerical magnitudes are tiny (~1e-4 for
    // speech). `M_fcn`'s ratio form keeps it in [~0.25, ~1.0]
    // regardless.
    let bin_half = fft_out.len() / 2;
    let energy_low: f32 = fft_out
        .iter()
        .take(bin_half)
        .map(Complex::<f32>::norm_sqr)
        .sum();
    let energy_high: f32 = fft_out
        .iter()
        .skip(bin_half)
        .map(Complex::<f32>::norm_sqr)
        .sum();
    let energy_total = energy_low + energy_high;
    // Peak tracking: jump halfway up toward a louder frame; decay at
    // 0.99 per frame otherwise.
    state.th_max = if energy_total > state.th_max {
        0.5 * (state.th_max + energy_total)
    } else {
        0.99_f32.mul_add(state.th_max, 0.01 * energy_total)
    };
    let mut m_fcn = {
        let num = 0.0025_f32.mul_add(state.th_max, energy_total);
        let den = 0.01_f32.mul_add(state.th_max, energy_total);
        if den < 1e-30 { 0.25 } else { num / den }
    };
    // If low-frequency energy is much smaller than high-frequency,
    // dampen `M_fcn` (the signal lacks the low-F structure typical of
    // speech, so the voicing decision needs more evidence).
    let hi5 = 5.0 * energy_high;
    if energy_low < hi5 && hi5 > 1e-30 {
        m_fcn *= (energy_low / hi5).sqrt();
    }

    // ── Per-harmonic sinusoidal fit ────────────────────────────
    let mut sa = [0.0_f32; MAX_HARMONICS];
    // Cache the sc_coef and window energy per harmonic so the final
    // SA calculation can reuse them after the band decision lands.
    let mut m_num = [0.0_f32; MAX_HARMONICS]; // observed window energy
    let mut sc_coef = [0.0_f32; MAX_HARMONICS]; // 1 / Σ wr_sp² for this harmonic
    let mut bin_counts = [0_usize; MAX_HARMONICS]; // bins used per harmonic

    let mut voiced = [false; MAX_BANDS];
    let mut d_num = 0.0_f32;
    let mut d_den = 0.0_f32;
    let mut band_cnt = 0_usize;
    let mut num_harms_cnt = 0_usize;
    let mut dsn_thr = 0.0_f32;
    // Cumulative band-center frequency (scaled to bin_half span). Used
    // to damp thresholds at high-frequency bands where harmonic
    // energy is typically lower.
    let mut band_center_norm = 0.0_f32;
    let band_center_step = f0_bin * HARMS_PER_BAND as f32 / (bin_half as f32);

    for k in 0..num_harms {
        let center = (k as f32 + 1.0) * f0_bin;
        let half_win = f0_bin * 0.5;
        let bin_lo = (center - half_win).ceil().max(0.0) as usize;
        let bin_hi = ((center + half_win).ceil() as usize).min(fft_out.len());
        // Empty window: the harmonic lies entirely past the end of the
        // spectrum (possible for the top harmonics of high pitches).
        // NOTE(review): this also skips the commit bookkeeping below,
        // so a trailing band whose harmonics all fall out of range
        // keeps its defaults — unvoiced, SA = 0.
        if bin_lo >= bin_hi {
            continue;
        }
        bin_counts[k] = bin_hi - bin_lo;

        // Compute per-band threshold once per band.
        if num_harms_cnt == 0 {
            dsn_thr = if e_p > 0.55 && band_cnt >= 1 {
                // Pitch too unreliable: no Dk can be < 0, so every
                // band past the first is forced unvoiced.
                0.0
            } else if state.prev_voiced[band_cnt] {
                (-0.1741_f32).mul_add(band_center_norm, 0.5625) * m_fcn
            } else {
                (-0.1393_f32).mul_add(band_center_norm, 0.45) * m_fcn
            };
            band_center_norm += band_center_step;
        }

        // Pass 1: windowed-sinusoid fit.
        let mut amp_re = 0.0_f32;
        let mut amp_im = 0.0_f32;
        let mut m_den_sum = 0.0_f32;
        for bin in bin_lo..bin_hi {
            let w = wr_sp_sample(bin, center);
            let c = fft_out
                .get(bin)
                .copied()
                .unwrap_or_else(|| Complex::new(0.0, 0.0));
            amp_re = c.re.mul_add(w, amp_re);
            amp_im = c.im.mul_add(w, amp_im);
            m_den_sum = w.mul_add(w, m_den_sum);
        }
        let sc = if m_den_sum > 1e-12 {
            1.0 / m_den_sum
        } else {
            0.0
        };
        sc_coef[k] = sc;
        let fit_re = amp_re * sc;
        let fit_im = amp_im * sc;

        // Pass 2: error + energy accumulation.
        let mut m_num_sum = 0.0_f32;
        for bin in bin_lo..bin_hi {
            let w = wr_sp_sample(bin, center);
            let rec_re = fit_re * w;
            let rec_im = fit_im * w;
            let c = fft_out
                .get(bin)
                .copied()
                .unwrap_or_else(|| Complex::new(0.0, 0.0));
            let err_re = c.re - rec_re;
            let err_im = c.im - rec_im;
            d_num += err_re.mul_add(err_re, err_im * err_im);
            m_num_sum += c.norm_sqr();
        }
        m_num[k] = m_num_sum;
        d_den += m_num_sum;

        // Commit band every HARMS_PER_BAND harmonics (except last).
        num_harms_cnt += 1;
        let last_harmonic = k + 1 == num_harms;
        let full_band = num_harms_cnt == HARMS_PER_BAND && band_cnt < num_bands - 1;
        let commit = full_band || last_harmonic;
        if commit {
            // Degenerate energy → Dk = 1 (never below any threshold).
            let dk = if d_den > 1e-12 { d_num / d_den } else { 1.0 };
            let band_voiced = dk < dsn_thr;
            voiced[band_cnt] = band_voiced;

            // Emit SA for the harmonics that make up this band.
            let first_k = (k + 1) - num_harms_cnt;
            for kk in first_k..=k {
                sa[kk] = if band_voiced {
                    voiced_sa_calc(m_num[kk], sc_coef[kk])
                } else {
                    unvoiced_sa_calc(m_num[kk], bin_counts[kk])
                };
            }

            d_num = 0.0;
            d_den = 0.0;
            num_harms_cnt = 0;
            band_cnt += 1;
        }
    }

    state.prev_voiced = voiced;
    let vuv = VuvDecisions { voiced, num_bands };
    let amps = SpectralAmplitudes {
        magnitudes: sa,
        num_harmonics: num_harms,
    };
    (vuv, amps)
}
348
349/// Look up `WR_SP[round(WR_SP_CENTER + (bin − harmonic_center) · 64)]`
350/// with bounds-check, returning `0.0` outside the table. Centralises
351/// the f32→index arithmetic + sign/precision-loss justification for
352/// every site that samples the spectral-response window.
353#[inline]
354fn wr_sp_sample(bin: usize, harmonic_center: f32) -> f32 {
355 // `bin` < 129 for a 256-pt real FFT, so `bin as f32` is lossless.
356 // `harmonic_center` ∈ (0, 128) for any valid pitch. The `.round()`
357 // output is therefore in `[WR_SP_CENTER − 64·128, WR_SP_CENTER +
358 // 64·128] ⊂ i32`. Bounds check against `WR_SP_LEN` happens
359 // before indexing; out-of-range returns zero.
360 #[allow(
361 clippy::cast_precision_loss,
362 reason = "bin ≤ 128 and WR_SP_CENTER = 160; both fit exactly in f32"
363 )]
364 let offset_raw = 64.0_f32.mul_add(bin as f32 - harmonic_center, WR_SP_CENTER as f32);
365 let offset = offset_raw.round();
366 if !offset.is_finite() || offset < 0.0 {
367 return 0.0;
368 }
369 #[allow(
370 clippy::cast_possible_truncation,
371 clippy::cast_sign_loss,
372 reason = "Checked above: offset is finite and ≥ 0; truncation is exact for the integer result of `.round()` on a bounded small value"
373 )]
374 let idx = offset as usize;
375 WR_SP.get(idx).copied().unwrap_or(0.0)
376}
377
/// Voiced spectral amplitude: `sqrt(m_num / sc)` — observed window
/// energy normalised by the inverse window-self-energy
/// `sc = 1 / Σ wr_sp²`. Float port of OP25's
/// `voiced_sa_calc(M_num, M_den)`.
#[inline]
fn voiced_sa_calc(m_num: f32, sc: f32) -> f32 {
    // OP25's fixed-point version scales by `2 · 256 · sqrt(2 · num/den)`
    // where its `den` is this `sc`, i.e. `2·num/den = 2·num·M_den_sum`;
    // here the absolute scale is left to the downstream quantizer's
    // SA_SCALE, so only the normalised magnitude is computed. A
    // degenerate `sc` (the caller stores 0 when the window energy
    // vanishes) yields zero amplitude instead of a near-division
    // by zero.
    if sc < 1e-12 {
        0.0
    } else {
        (m_num / sc).sqrt()
    }
}
396
/// Unvoiced spectral amplitude: RMS per-bin observed energy scaled
/// by 0.1454 (compensates for unvoiced-synthesis overshoot) and √2.
/// Float port of OP25's `unvoiced_sa_calc(M_num, bin_count)`.
/// Returns `0.0` for an empty window (`bin_count == 0`).
#[inline]
fn unvoiced_sa_calc(m_num: f32, bin_count: usize) -> f32 {
    if bin_count == 0 {
        return 0.0;
    }
    #[allow(clippy::cast_precision_loss, reason = "bin_count ≤ 128")]
    let n = bin_count as f32;
    // √2 as the named constant: bit-identical to `(2.0_f32).sqrt()`
    // (both are the correctly-rounded f32 value) but clearer and not
    // dependent on the optimizer folding a runtime sqrt.
    0.1454 * (m_num / n).sqrt() * std::f32::consts::SQRT_2
}
409
#[cfg(test)]
mod tests {
    use super::{MAX_BANDS, VuvState, detect_vuv, detect_vuv_and_sa};
    use realfft::num_complex::Complex;

    /// A silent spectrum is classified as entirely unvoiced (all
    /// bands false).
    #[test]
    fn silent_spectrum_is_unvoiced() {
        let fft_out = vec![Complex::new(0.0, 0.0); 129];
        let decisions = detect_vuv(&fft_out, 6.4);
        assert!(decisions.voiced.iter().all(|&v| !v));
    }

    /// A spectrum shaped like the analysis window's sinusoidal
    /// response — `wr_sp` centered at each `k · f0_bin` — should be
    /// classified as voiced. Single-bin-impulse inputs don't work
    /// for the integrated detector: its sinusoidal fit expects the
    /// tapered `wr_sp` spectral lobe the real encoder's analysis
    /// window produces, so impulses leave `D_num` large and the ratio
    /// test fails even though the spectrum "looks" harmonic.
    #[test]
    fn harmonic_spectrum_is_voiced() {
        use crate::encode::wr_sp::{WR_SP, WR_SP_CENTER};
        let mut fft_out = vec![Complex::new(0.0, 0.0); 129];
        let f0_bin = 6.4_f32;
        // Paint the wr_sp lobe at each harmonic. The lobe spans ≈2.5
        // bins either side of center (160/64 ≈ 2.5), tapered to zero
        // at the edges.
        for k in 1..=10 {
            #[allow(
                clippy::cast_precision_loss,
                reason = "k ∈ [1, 10]; f32 rounding is exact below 2^24"
            )]
            let center = f0_bin * k as f32;
            for (wr_idx, &w) in WR_SP.iter().enumerate() {
                #[allow(
                    clippy::cast_precision_loss,
                    reason = "wr_idx ∈ [0, 321); exact in f32"
                )]
                let offset_bins = (wr_idx as f32 - WR_SP_CENTER as f32) / 64.0;
                // `bin` is bounded: center ∈ [6.4, 64], offset_bins
                // ∈ [−2.5, 2.5], so `bin ∈ [3, 67]` — well within
                // usize and within the 129-bin FFT.
                #[allow(
                    clippy::cast_possible_truncation,
                    clippy::cast_sign_loss,
                    reason = "bounded per comment above; truncation is exact for `.round()`"
                )]
                let bin = (center + offset_bins).round() as usize;
                if let Some(c) = fft_out.get_mut(bin) {
                    c.re += w;
                }
            }
        }
        let decisions = detect_vuv(&fft_out, f0_bin);
        let any_voiced = decisions.voiced.iter().any(|&v| v);
        assert!(
            any_voiced,
            "expected at least one voiced band; got {:?}",
            decisions.voiced
        );
    }

    /// A flat spectrum without clear harmonic peaks should not be
    /// called voiced.
    #[test]
    fn flat_spectrum_is_not_voiced() {
        let mut fft_out = vec![Complex::new(0.0, 0.0); 129];
        for c in &mut fft_out {
            *c = Complex::new(0.1, 0.0);
        }
        let decisions = detect_vuv(&fft_out, 6.4);
        let voiced_count = decisions.voiced.iter().filter(|&&v| v).count();
        // A flat spectrum's sinusoidal fit has residual error
        // comparable to the signal itself → Dk is near 1 → not voiced.
        assert!(
            voiced_count < decisions.num_bands,
            "all bands voiced for flat spectrum: {voiced_count}/{}",
            decisions.num_bands
        );
    }

    /// `num_bands` ≈ `num_harms / 3`, capped at `MAX_BANDS`.
    #[test]
    fn band_count_tracks_pitch() {
        let fft_out = vec![Complex::new(0.0, 0.0); 129];
        // f0_bin=2 ⇒ period=128 ⇒ tmp=⌊64.25⌋=64 ⇒ num_harms=⌊64·0.9254⌋=59,
        // clamped to MAX_HARMONICS; > 36 harmonics ⇒ num_bands = MAX_BANDS.
        let d_low = detect_vuv(&fft_out, 2.0);
        assert_eq!(d_low.num_bands, MAX_BANDS);
        // f0_bin=20 ⇒ period=12.8 ⇒ tmp=⌊6.65⌋=6 ⇒ ⌊6·0.9254⌋=5, clamped up
        // to the 9-harmonic minimum ⇒ num_bands=(9+2)/3=3.
        let d_high = detect_vuv(&fft_out, 20.0);
        assert!(d_high.num_bands >= 1 && d_high.num_bands <= 4);
    }

    /// Hysteresis: a band that was voiced is easier to keep voiced
    /// on the next frame.
    #[test]
    fn hysteresis_biases_voicing_decision() {
        // Build a spectrum that's a borderline voicing case — the
        // same ratio produces different decisions depending on
        // `state.prev_voiced`.
        let mut fft_out = vec![Complex::new(0.0, 0.0); 129];
        let f0_bin = 6.4_f32;
        for k in 1..=10 {
            #[allow(
                clippy::cast_precision_loss,
                clippy::cast_possible_truncation,
                clippy::cast_sign_loss
            )]
            let bin = (f0_bin * k as f32).round() as usize;
            if let Some(c) = fft_out.get_mut(bin) {
                *c = Complex::new(0.6, 0.0);
            }
        }
        // Add some background noise.
        for c in &mut fft_out {
            c.re += 0.15;
        }

        // Fresh state: all bands unvoiced → harder threshold.
        let mut fresh = VuvState::new();
        let (d1, _) = detect_vuv_and_sa(&fft_out, f0_bin, &mut fresh, 0.0);

        // Biased state: band 0 was voiced → easier threshold.
        let mut biased = VuvState::new();
        biased.prev_voiced[0] = true;
        let (d2, _) = detect_vuv_and_sa(&fft_out, f0_bin, &mut biased, 0.0);

        // In the biased case, at least as many bands voiced as fresh.
        let count_fresh = d1.voiced.iter().filter(|&&v| v).count();
        let count_biased = d2.voiced.iter().filter(|&&v| v).count();
        assert!(
            count_biased >= count_fresh,
            "hysteresis failed to bias voicing: fresh={count_fresh}, biased={count_biased}"
        );
    }

    /// Integrated SA path should produce non-zero magnitudes when the
    /// spectrum has visible harmonic content.
    #[test]
    fn integrated_sa_nonzero_for_harmonic_input() {
        let mut fft_out = vec![Complex::new(0.0, 0.0); 129];
        let f0_bin = 6.4_f32;
        for k in 1..=10 {
            #[allow(
                clippy::cast_precision_loss,
                clippy::cast_possible_truncation,
                clippy::cast_sign_loss
            )]
            let bin = (f0_bin * k as f32).round() as usize;
            if let Some(c) = fft_out.get_mut(bin) {
                *c = Complex::new(1.0, 0.0);
            }
        }
        let mut state = VuvState::new();
        let (_vuv, amps) = detect_vuv_and_sa(&fft_out, f0_bin, &mut state, 0.0);
        let total: f32 = amps.magnitudes.iter().sum();
        assert!(
            total > 0.0,
            "integrated SA all zero on harmonic input ({amps:?})"
        );
    }
}
573}