cgevans / equiconc · build 24938269110 (push, via github, committer web-flow)
Merge pull request #10 from cgevans/web-frontend: Web frontend

25 Apr 2026 07:00PM UTC · coverage: 91.261% (-0.8%) from 92.103%
550 of 614 new or added lines in 4 files covered (89.58%)
10 existing lines in 1 file now uncovered
2997 of 3284 relevant lines covered (91.26%) · 6960.48 hits per line

Source file: /src/simd_kernels.rs · 98.55% of lines covered
//! Vectorized element-wise kernels for the per-species hot loops in
//! `evaluate_into` / `evaluate_log_into` (and their cheap-objective
//! cousins).
//!
//! Two compile paths:
//! - With `--features simd` (default), bodies dispatch through `pulp` to
//!   the best SIMD ISA available at runtime (SSE2 → AVX2 → AVX-512 on
//!   x86; NEON on aarch64; scalar on wasm).
//! - With `--no-default-features`, every kernel is a plain `for` loop
//!   over `f64`, calling `f64::exp` from libm. Bit-identical to the
//!   pre-0.4 behaviour.
//!
//! Numerical contract: the linear-path kernels (`min_clamp_exp_*`)
//! evaluate `exp` via a degree-12 Taylor polynomial after Cody-Waite
//! range reduction, accurate to ≤2 ulps. The log-path kernels
//! (`fused_lse_and_exp_clamp`, `lse_sum`) drop to scalar libm `exp`
//! per-lane via pulp's partial_store/load — the trust-region step
//! acceptance on `g = ln f` requires per-iteration progress measurable
//! above `4·eps·|g|`, and the polynomial's 1–2 ulp residual stagnates
//! the iteration on extremely stiff systems (COFFEE testcase 0 was
//! the canonical failure). The log-path kernels keep SIMD parallelism
//! for the mins, adds, the Neumaier-compensated LSE reduction, and
//! the clamped-exp store, which is still a measurable end-to-end win
//! over the scalar path.
//!
//! Both `fused_lse_and_exp_clamp` and `lse_sum` use the same libm exp
//! and same compensated reduction so the trust-region ρ check
//! (`g_old` from the full eval, `cand.g` from the candidate eval)
//! sees consistent rounding; mixing libm and polynomial within one
//! solve would break ρ stability. The `Kernels` struct centralizes
//! the dispatch so the two evaluators can never disagree.
//!
//! Outer scalar `lse.exp()` / `f.ln()` calls in `evaluate_log_into`
//! stay on libm — they're single calls per iteration, not loops, and
//! libm precision protects the `f_positive` cancellation detection.
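//!
//! For orientation, the shifted-LSE identity the log path is built on,
//! as a minimal self-contained sketch (not the kernel code itself):
//!
//! ```
//! // ln Σ exp(t[j]) = t_max + ln Σ exp(t[j] - t_max); after the shift
//! // every summand is ≤ 1, so the sum stays in range even though
//! // exp(700) is near the f64 overflow threshold.
//! let t = [700.0_f64, 699.0, 1.0];
//! let t_max = t.iter().copied().fold(f64::NEG_INFINITY, f64::max);
//! let sum_shifted: f64 = t.iter().map(|&x| (x - t_max).exp()).sum();
//! let lse = t_max + sum_shifted.ln();
//! assert!((lse - 700.3132616875182).abs() < 1e-9);
//! ```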

#[cfg(feature = "simd")]
use pulp::{Arch, Simd, WithSimd};

/// Cached SIMD dispatch state. `Kernels::new()` is cheap (a single
/// CPUID query the first time, branch-on-cached-result thereafter), and
/// the `Kernels` instance is stashed on `System` so each `solve()` pays
/// it at most once.
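///
/// A minimal usage sketch (illustrative only; `n` and `log_q` are
/// placeholders, and the real call sites are the evaluators):
///
/// ```ignore
/// let kernels = Kernels::new();        // cheap; cached on `System`
/// let mut c = vec![0.0_f64; n];
/// kernels.add_inplace(&mut c, &log_q); // c += log_q, SIMD when enabled
/// ```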
#[derive(Clone, Copy, Default)]
pub(crate) struct Kernels {
    #[cfg(feature = "simd")]
    arch: Arch,
}

impl std::fmt::Debug for Kernels {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Kernels").finish()
    }
}

impl Kernels {
    pub(crate) fn new() -> Self {
        Self {
            #[cfg(feature = "simd")]
            arch: Arch::new(),
        }
    }

    /// `c += log_q`, contiguous slices, equal length.
    #[inline]
    pub(crate) fn add_inplace(self, c: &mut [f64], log_q: &[f64]) {
        debug_assert_eq!(c.len(), log_q.len());
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(AddInplace { c, log_q });
        }
        #[cfg(not(feature = "simd"))]
        {
            for (ci, &qi) in c.iter_mut().zip(log_q.iter()) {
                *ci += qi;
            }
        }
    }

    /// `grad -= c0`, contiguous slices, equal length.
    #[inline]
    pub(crate) fn sub_inplace(self, grad: &mut [f64], c0: &[f64]) {
        debug_assert_eq!(grad.len(), c0.len());
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(SubInplace { grad, c0 });
        }
        #[cfg(not(feature = "simd"))]
        {
            for (gi, &ci) in grad.iter_mut().zip(c0.iter()) {
                *gi -= ci;
            }
        }
    }

    /// `out = src * a` for the gradient rescale `grad_g = grad / f`.
    /// Length is `n_mon` (small), so this is mostly here to keep the
    /// scalar/SIMD split symmetric.
    #[inline]
    pub(crate) fn scale_into(self, out: &mut [f64], src: &[f64], a: f64) {
        debug_assert_eq!(out.len(), src.len());
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(ScaleInto { out, src, a });
        }
        #[cfg(not(feature = "simd"))]
        {
            for (oi, &si) in out.iter_mut().zip(src.iter()) {
                *oi = si * a;
            }
        }
    }

    /// Linear-path one-pass kernel: `t[j] = exp(min(t[j], log_c_clamp))`,
    /// returning `Σ_j t[j]` (the post-exp values).
    ///
    /// Combines `c.mapv_inplace(|x| x.min(clamp).exp())` and the
    /// subsequent `c.sum()` into a single sweep.
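    ///
    /// Contract sketch (illustrative values, `kernels` as above):
    ///
    /// ```ignore
    /// let mut t = vec![-1.0_f64, 0.0, 5.0];
    /// let sum = kernels.min_clamp_exp_inplace_sum(&mut t, 2.0);
    /// // t is now [exp(-1), exp(0), exp(2)]; sum is their total.
    /// ```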
    #[inline]
    pub(crate) fn min_clamp_exp_inplace_sum(self, t: &mut [f64], log_c_clamp: f64) -> f64 {
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(MinClampExpInplaceSum { t, log_c_clamp })
        }
        #[cfg(not(feature = "simd"))]
        {
            let mut sum = 0.0;
            for x in t.iter_mut() {
                let e = x.min(log_c_clamp).exp();
                *x = e;
                sum += e;
            }
            sum
        }
    }

    /// Read-only variant for the cheap-objective path
    /// (`evaluate_objective_into`): returns `Σ_j exp(min(t[j], clamp))`
    /// without writing back to `t`.
    #[inline]
    pub(crate) fn min_clamp_exp_sum(self, t: &[f64], log_c_clamp: f64) -> f64 {
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(MinClampExpSum { t, log_c_clamp })
        }
        #[cfg(not(feature = "simd"))]
        {
            t.iter().map(|&x| x.min(log_c_clamp).exp()).sum()
        }
    }

    /// Log-path fused kernel: produces `(t_max, sum_shifted)` and
    /// rewrites `t[j] ← exp(min(t[j], log_c_clamp))` in place.
    ///
    /// `sum_shifted = Σ_j exp(t_unclamped[j] − t_max)`. The unclamped
    /// LSE is the load-bearing quantity for the trust-region step
    /// acceptance — clamping inside the LSE biases `f`. The clamp is
    /// applied only when realising `c` for the gradient/Hessian.
    ///
    /// Implementation: two SIMD passes (max-reduction, then fused
    /// shifted-exp + clamped-exp). The original scalar code does three
    /// passes; the saved memory traffic alone is ~33%.
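    ///
    /// How the pieces feed the log objective (a sketch; the outer `ln`
    /// stays on libm as noted in the module docs):
    ///
    /// ```ignore
    /// let (t_max, sum_shifted) = kernels.fused_lse_and_exp_clamp(&mut t, clamp);
    /// let lse = t_max + sum_shifted.ln(); // == ln Σ exp(t_unclamped[j])
    /// ```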
    #[inline]
    pub(crate) fn fused_lse_and_exp_clamp(self, t: &mut [f64], log_c_clamp: f64) -> (f64, f64) {
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(FusedLseAndExpClamp { t, log_c_clamp })
        }
        #[cfg(not(feature = "simd"))]
        {
            let mut t_max = f64::NEG_INFINITY;
            for &x in t.iter() {
                if x > t_max {
                    t_max = x;
                }
            }
            let mut sum_shifted = 0.0;
            for x in t.iter_mut() {
                sum_shifted += (*x - t_max).exp();
                *x = x.min(log_c_clamp).exp();
            }
            (t_max, sum_shifted)
        }
    }

    /// Cheap-objective log path: returns `(t_max, sum_shifted)` without
    /// modifying `t`. Mirrors `fused_lse_and_exp_clamp` but read-only.
    #[inline]
    pub(crate) fn lse_sum(self, t: &[f64]) -> (f64, f64) {
        #[cfg(feature = "simd")]
        {
            self.arch.dispatch(LseSum { t })
        }
        #[cfg(not(feature = "simd"))]
        {
            let mut t_max = f64::NEG_INFINITY;
            for &x in t.iter() {
                if x > t_max {
                    t_max = x;
                }
            }
            let mut sum_shifted = 0.0;
            for &x in t.iter() {
                sum_shifted += (x - t_max).exp();
            }
            (t_max, sum_shifted)
        }
    }
}

// =====================================================================
// SIMD bodies (only compiled with `--features simd`).
// =====================================================================

#[cfg(feature = "simd")]
mod simd_impl {
    use super::*;

    /// libm-precision SIMD `exp` via per-lane partial_store → scalar
    /// libm → partial_load.
    ///
    /// Used by the log-path kernels (`fused_lse_and_exp_clamp`,
    /// `lse_sum`) because the trust-region step acceptance on the
    /// log-objective drives `g = ln f` toward `O(1)` while requiring
    /// per-iteration progress to be measurable above
    /// `4·eps·|g|` ≈ 6e-15 — borderline territory where the
    /// polynomial's 1–2 ulp residual can stagnate the iteration on
    /// extremely stiff systems (e.g. COFFEE testcase 0 with `c0` in
    /// the nM range and `n_species ≈ 54k`).
    ///
    /// We give up the polynomial speedup on the exp call itself but
    /// keep SIMD parallelism for everything around it (mins, adds,
    /// the LSE compensated reduction, the clamped-exp store), which
    /// still nets a measurable end-to-end win against the scalar
    /// path. The linear-path kernels still use the polynomial
    /// `simd_exp` below — the linear objective operates in
    /// far-from-machine-epsilon territory where the 1–2 ulp residual
    /// is not load-bearing for convergence.
    #[inline(always)]
    pub(super) fn simd_exp_libm<S: Simd>(simd: S, t: S::f64s) -> S::f64s {
        let lanes = S::F64_LANES;
        let mut t_buf = [0.0f64; 8];
        simd.partial_store_f64s(&mut t_buf[..lanes], t);
        let mut out = [0.0f64; 8];
        for i in 0..lanes {
            out[i] = t_buf[i].exp();
        }
        simd.partial_load_f64s(&out[..lanes])
    }

    /// Vectorized `exp(t)` accurate to ~2 ulps on the domain we feed it
    /// (Aᵀλ + log_q values, typically within ±~700 — large positive
    /// values are clamped before the kernel sees them in the linear
    /// path, but the log-path kernel may pass un-clamped values
    /// straight to `exp(t - t_max)` where `t - t_max ≤ 0`, so the
    /// negative tail is the operating regime).
    ///
    /// Method: range-reduce `t = k·ln 2 + r`, evaluate `exp(r)` on
    /// `r ∈ [−ln 2 / 2, ln 2 / 2]` via a Horner-evaluated degree-12
    /// minimax, then `exp(t) = 2^k · exp(r)`. The 2^k step is the only
    /// place we need bit-shifts; pulp 0.22 doesn't expose 64-bit SIMD
    /// shifts, so we drop to scalar via `partial_store_f64s` →
    /// per-lane bit construction → `partial_load_f64s`. With the
    /// polynomial cost dominating, the few-lane scalar epilogue is in
    /// the noise.
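    ///
    /// A worked instance of the reduction (values rounded for display):
    /// for `t = 10`, `k = round(10 / ln 2) = 14` and
    /// `r = 10 − 14·ln 2 ≈ 0.295939`, so
    /// `exp(10) = 2¹⁴ · exp(r) ≈ 16384 · 1.344389 ≈ 22026.466`.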
    #[inline(always)]
    pub(super) fn simd_exp<S: Simd>(simd: S, t: S::f64s) -> S::f64s {
        // Cody-Waite split of ln(2) into hi/lo doubles, exact to ~108 bits.
        let ln2_hi = simd.splat_f64s(0.693_147_180_369_123_8); // 0x3FE62E42_FEE00000
        let ln2_lo = simd.splat_f64s(1.908_214_929_270_587_7e-10);
        let inv_ln2 = simd.splat_f64s(std::f64::consts::LOG2_E);

        // Round-to-nearest-even via the magic-constant trick: works for
        // any sign provided |t * inv_ln2| < 2^52.
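        // Adding 2^52 + 2^51 pushes the integer part of t·log₂e to the
        // bottom of the mantissa, so the f64 addition itself performs
        // the rounding; subtracting the constant back leaves
        // round(t / ln 2) as an exact small integer (e.g. for t = 10,
        // 14.42695… rounds to kf = 14.0).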
        let magic = simd.splat_f64s((1u64 << 52) as f64 + (1u64 << 51) as f64);
        let kf = simd.mul_add_f64s(t, inv_ln2, magic);
        let kf = simd.sub_f64s(kf, magic);

        // r = t − k·ln 2, computed as (t − k·ln2_hi) − k·ln2_lo via FMA.
        let r = simd.mul_add_f64s(simd.neg_f64s(kf), ln2_hi, t);
        let r = simd.mul_add_f64s(simd.neg_f64s(kf), ln2_lo, r);

        // Degree-12 minimax for exp(r) on [−ln 2/2, ln 2/2]. The
        // truncated Taylor coefficients are within machine precision of
        // the minimax coefficients on this tiny interval, so we use
        // 1/k! directly — easy to read and accurate to <1 ulp before
        // the 2^k multiply.
        let c12 = simd.splat_f64s(1.0 / 479_001_600.0);
        let c11 = simd.splat_f64s(1.0 / 39_916_800.0);
        let c10 = simd.splat_f64s(1.0 / 3_628_800.0);
        let c9 = simd.splat_f64s(1.0 / 362_880.0);
        let c8 = simd.splat_f64s(1.0 / 40_320.0);
        let c7 = simd.splat_f64s(1.0 / 5_040.0);
        let c6 = simd.splat_f64s(1.0 / 720.0);
        let c5 = simd.splat_f64s(1.0 / 120.0);
        let c4 = simd.splat_f64s(1.0 / 24.0);
        let c3 = simd.splat_f64s(1.0 / 6.0);
        let c2 = simd.splat_f64s(0.5);
        let c1 = simd.splat_f64s(1.0);
        let one = simd.splat_f64s(1.0);

        // Horner.
        let mut p = c12;
        p = simd.mul_add_f64s(p, r, c11);
        p = simd.mul_add_f64s(p, r, c10);
        p = simd.mul_add_f64s(p, r, c9);
        p = simd.mul_add_f64s(p, r, c8);
        p = simd.mul_add_f64s(p, r, c7);
        p = simd.mul_add_f64s(p, r, c6);
        p = simd.mul_add_f64s(p, r, c5);
        p = simd.mul_add_f64s(p, r, c4);
        p = simd.mul_add_f64s(p, r, c3);
        p = simd.mul_add_f64s(p, r, c2);
        p = simd.mul_add_f64s(p, r, c1);
        p = simd.mul_add_f64s(p, r, one);

        // 2^k via per-lane bit construction.
        //
        // Buffer is sized for the widest f64 SIMD on stable Rust
        // (AVX-512: 8 lanes). Four regimes:
        //
        // - `k ∈ [-1022, 1023]`: biased exponent (k + 1023) is in
        //   [1, 2046], fits in 11 bits, gives a normal f64 directly.
        // - `k ∈ [-1074, -1023]`: subnormal range. We can't construct
        //   the value with one shift, but the two-mul split
        //   `2^k = 2^-1022 · 2^(k+1022)` works because `k+1022 ∈
        //   [-52, -1]` is well inside the normal range. f64
        //   multiplication produces the correct subnormal result.
        // - `k < -1074`: true underflow → 0.0.
        // - `k > 1023`: saturate to `2^1023` (largest finite power of
        //   two). In `evaluate_into` inputs are clamped to
        //   `log_c_clamp ≤ 700` before the kernel sees them; in the
        //   log path `t − t_max ≤ 0`. This branch is therefore
        //   defence-in-depth.
        //
        // An earlier version sign-extended `(k + 1023) as u64` for
        // negative `k`, producing a huge negative f64 instead of
        // underflowing to zero; the
        // `min_clamp_exp_inplace_sum_stress_large` and `simd_exp_
        // accuracy_sweep` tests gate against regression there.
        let lanes = S::F64_LANES;
        let mut k_buf = [0.0f64; 8];
        simd.partial_store_f64s(&mut k_buf[..lanes], kf);
        let mut pow2_buf = [0.0f64; 8];
        for i in 0..lanes {
            let k = k_buf[i] as i64;
            if k > 1023 {
                pow2_buf[i] = f64::from_bits(((1023_i64 + 1023) as u64) << 52);
            } else if k >= -1022 {
                pow2_buf[i] = f64::from_bits(((k + 1023) as u64) << 52);
            } else if k >= -1074 {
                // k + 1022 ∈ [-52, -1] — normal f64.
                let small_normal = f64::from_bits(((k + 1022 + 1023) as u64) << 52);
                let smallest_normal = f64::from_bits(1u64 << 52); // 2^-1022
                pow2_buf[i] = smallest_normal * small_normal;
            } else {
                pow2_buf[i] = 0.0;
            }
        }
        let pow2 = simd.partial_load_f64s(&pow2_buf[..lanes]);

        simd.mul_f64s(p, pow2)
    }

    pub(super) struct AddInplace<'a> {
        pub c: &'a mut [f64],
        pub log_q: &'a [f64],
    }
    impl<'a> WithSimd for AddInplace<'a> {
        type Output = ();
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) {
            let Self { c, log_q } = self;
            let (c_head, c_tail) = S::as_mut_simd_f64s(c);
            let (q_head, q_tail) = S::as_simd_f64s(log_q);
            for (cv, qv) in c_head.iter_mut().zip(q_head.iter()) {
                *cv = simd.add_f64s(*cv, *qv);
            }
            for (cv, &qv) in c_tail.iter_mut().zip(q_tail.iter()) {
                *cv += qv;
            }
        }
    }

    pub(super) struct SubInplace<'a> {
        pub grad: &'a mut [f64],
        pub c0: &'a [f64],
    }
    impl<'a> WithSimd for SubInplace<'a> {
        type Output = ();
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) {
            let Self { grad, c0 } = self;
            let (g_head, g_tail) = S::as_mut_simd_f64s(grad);
            let (c_head, c_tail) = S::as_simd_f64s(c0);
            for (gv, cv) in g_head.iter_mut().zip(c_head.iter()) {
                *gv = simd.sub_f64s(*gv, *cv);
            }
            for (gv, &cv) in g_tail.iter_mut().zip(c_tail.iter()) {
                *gv -= cv;
            }
        }
    }

    pub(super) struct ScaleInto<'a> {
        pub out: &'a mut [f64],
        pub src: &'a [f64],
        pub a: f64,
    }
    impl<'a> WithSimd for ScaleInto<'a> {
        type Output = ();
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) {
            let Self { out, src, a } = self;
            let av = simd.splat_f64s(a);
            let (o_head, o_tail) = S::as_mut_simd_f64s(out);
            let (s_head, s_tail) = S::as_simd_f64s(src);
            for (ov, sv) in o_head.iter_mut().zip(s_head.iter()) {
                *ov = simd.mul_f64s(*sv, av);
            }
            for (ov, &sv) in o_tail.iter_mut().zip(s_tail.iter()) {
                *ov = sv * a;
            }
        }
    }

    pub(super) struct MinClampExpInplaceSum<'a> {
        pub t: &'a mut [f64],
        pub log_c_clamp: f64,
    }
    impl<'a> WithSimd for MinClampExpInplaceSum<'a> {
        type Output = f64;
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) -> f64 {
            let Self { t, log_c_clamp } = self;
            let clamp_v = simd.splat_f64s(log_c_clamp);
            let mut sum_v = simd.splat_f64s(0.0);
            let (head, tail) = S::as_mut_simd_f64s(t);
            for chunk in head.iter_mut() {
                let clamped = simd.min_f64s(*chunk, clamp_v);
                let e = simd_exp(simd, clamped);
                *chunk = e;
                sum_v = simd.add_f64s(sum_v, e);
            }
            let mut sum = simd.reduce_sum_f64s(sum_v);
            for x in tail.iter_mut() {
                let e = x.min(log_c_clamp).exp();
                *x = e;
                sum += e;
            }
            sum
        }
    }

    pub(super) struct MinClampExpSum<'a> {
        pub t: &'a [f64],
        pub log_c_clamp: f64,
    }
    impl<'a> WithSimd for MinClampExpSum<'a> {
        type Output = f64;
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) -> f64 {
            let Self { t, log_c_clamp } = self;
            let clamp_v = simd.splat_f64s(log_c_clamp);
            let mut sum_v = simd.splat_f64s(0.0);
            let (head, tail) = S::as_simd_f64s(t);
            for &chunk in head.iter() {
                let clamped = simd.min_f64s(chunk, clamp_v);
                let e = simd_exp(simd, clamped);
                sum_v = simd.add_f64s(sum_v, e);
            }
            let mut sum = simd.reduce_sum_f64s(sum_v);
            for &x in tail.iter() {
                sum += x.min(log_c_clamp).exp();
            }
            sum
        }
    }

    pub(super) struct FusedLseAndExpClamp<'a> {
        pub t: &'a mut [f64],
        pub log_c_clamp: f64,
    }
    impl<'a> WithSimd for FusedLseAndExpClamp<'a> {
        type Output = (f64, f64);
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) -> (f64, f64) {
            let Self { t, log_c_clamp } = self;

            // Pass 1: t_max via SIMD reduction.
            let neg_inf = simd.splat_f64s(f64::NEG_INFINITY);
            let mut max_v = neg_inf;
            {
                let (head, tail) = S::as_simd_f64s(t);
                for &chunk in head.iter() {
                    max_v = simd.max_f64s(max_v, chunk);
                }
                let mut t_max_scalar = simd.reduce_max_f64s(max_v);
                for &x in tail.iter() {
                    if x > t_max_scalar {
                        t_max_scalar = x;
                    }
                }
                // Broadcast back into a SIMD register for pass 2.
                max_v = simd.splat_f64s(t_max_scalar);
            }
            let t_max_scalar = simd.reduce_max_f64s(max_v);

            // Pass 2: write c = exp(min(t, clamp)); accumulate
            // Σ exp(t − t_max) with Neumaier-compensated summation.
            //
            // The trust-region step acceptance can drift past the
            // strict convergence boundary on extremely stiff systems
            // (n_species in the 50k+ range with c0 in the nM regime —
            // COFFEE's testcase 0 is the canonical failure mode here)
            // when the LSE summation accumulates O(n_species·eps)
            // rounding. Compensating the SIMD per-lane accumulator
            // with a parallel "lost-bits" register costs one extra
            // FMA-equivalent op per chunk and brings the total error
            // down to O(eps²). The cross-lane reduction at the end is
            // small (≤ 8 lanes on AVX-512) and uses the same
            // compensation.
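            //
            // One illustrative pair: with sum = 1.0 and e = 1e-17 the
            // plain add rounds t_new back to 1.0 and drops e entirely;
            // here lost = (1.0 − 1.0) + 1e-17 = 1e-17 is kept in
            // comp_v and restored in the final reduction.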
            let clamp_v = simd.splat_f64s(log_c_clamp);
            let mut sum_v = simd.splat_f64s(0.0);
            let mut comp_v = simd.splat_f64s(0.0);
            let (head, tail) = S::as_mut_simd_f64s(t);
            for chunk in head.iter_mut() {
                let raw = *chunk;
                let clamped = simd.min_f64s(raw, clamp_v);
                let e_clamped = simd_exp_libm(simd, clamped);
                let shifted = simd.sub_f64s(raw, max_v);
                let e_shifted = simd_exp_libm(simd, shifted);
                *chunk = e_clamped;
                // Neumaier compensation. `t_new = sum + e_shifted`
                // exactly when |sum| ≥ |e_shifted|; otherwise the
                // roles swap. Either way the lost low-order bits are
                // captured into `comp_v`.
                let t_new = simd.add_f64s(sum_v, e_shifted);
                let abs_sum = simd.abs_f64s(sum_v);
                let abs_e = simd.abs_f64s(e_shifted);
                let sum_bigger = simd.greater_than_or_equal_f64s(abs_sum, abs_e);
                let lost_when_sum_bigger = simd.add_f64s(simd.sub_f64s(sum_v, t_new), e_shifted);
                let lost_when_e_bigger = simd.add_f64s(simd.sub_f64s(e_shifted, t_new), sum_v);
                let lost = simd.select_f64s(sum_bigger, lost_when_sum_bigger, lost_when_e_bigger);
                comp_v = simd.add_f64s(comp_v, lost);
                sum_v = t_new;
            }
            let lane_sum = simd.reduce_sum_f64s(sum_v);
            let lane_comp = simd.reduce_sum_f64s(comp_v);
            let mut sum_shifted = lane_sum + lane_comp;
            // Tail epilogue with the same compensation pattern.
            let mut comp = 0.0_f64;
            for x in tail.iter_mut() {
                let raw = *x;
                let e_shifted = (raw - t_max_scalar).exp();
                let t_new = sum_shifted + e_shifted;
                if sum_shifted.abs() >= e_shifted.abs() {
                    comp += (sum_shifted - t_new) + e_shifted;
                } else {
                    comp += (e_shifted - t_new) + sum_shifted;
                }
                sum_shifted = t_new;
                *x = raw.min(log_c_clamp).exp();
            }
            sum_shifted += comp;
            (t_max_scalar, sum_shifted)
        }
    }

    pub(super) struct LseSum<'a> {
        pub t: &'a [f64],
    }
    impl<'a> WithSimd for LseSum<'a> {
        type Output = (f64, f64);
        #[inline(always)]
        fn with_simd<S: Simd>(self, simd: S) -> (f64, f64) {
            let Self { t } = self;
            let neg_inf = simd.splat_f64s(f64::NEG_INFINITY);
            let mut max_v = neg_inf;
            let (head, tail) = S::as_simd_f64s(t);
            for &chunk in head.iter() {
                max_v = simd.max_f64s(max_v, chunk);
            }
            let mut t_max = simd.reduce_max_f64s(max_v);
            for &x in tail.iter() {
                if x > t_max {
                    t_max = x;
                }
            }
            let max_v = simd.splat_f64s(t_max);
            // Neumaier-compensated SIMD sum — see the equivalent
            // section in `FusedLseAndExpClamp` for the rationale; the
            // candidate evaluator must match the full evaluator's
            // sum semantics so the trust-region ρ check sees a
            // consistent g vs cand.g pair.
            let mut sum_v = simd.splat_f64s(0.0);
            let mut comp_v = simd.splat_f64s(0.0);
            for &chunk in head.iter() {
                let shifted = simd.sub_f64s(chunk, max_v);
                let e = simd_exp_libm(simd, shifted);
                let t_new = simd.add_f64s(sum_v, e);
                let abs_sum = simd.abs_f64s(sum_v);
                let abs_e = simd.abs_f64s(e);
                let sum_bigger = simd.greater_than_or_equal_f64s(abs_sum, abs_e);
                let lost_when_sum_bigger = simd.add_f64s(simd.sub_f64s(sum_v, t_new), e);
                let lost_when_e_bigger = simd.add_f64s(simd.sub_f64s(e, t_new), sum_v);
                let lost = simd.select_f64s(sum_bigger, lost_when_sum_bigger, lost_when_e_bigger);
                comp_v = simd.add_f64s(comp_v, lost);
                sum_v = t_new;
            }
            let lane_sum = simd.reduce_sum_f64s(sum_v);
            let lane_comp = simd.reduce_sum_f64s(comp_v);
            let mut sum = lane_sum + lane_comp;
            let mut comp = 0.0_f64;
            for &x in tail.iter() {
                let e = (x - t_max).exp();
                let t_new = sum + e;
                if sum.abs() >= e.abs() {
                    comp += (sum - t_new) + e;
                } else {
                    comp += (e - t_new) + sum;
                }
                sum = t_new;
            }
            sum += comp;
            (t_max, sum)
        }
    }
}

#[cfg(feature = "simd")]
use simd_impl::{
    AddInplace, FusedLseAndExpClamp, LseSum, MinClampExpInplaceSum, MinClampExpSum, ScaleInto,
    SubInplace,
};

#[cfg(test)]
mod tests {
    use super::*;

    fn rel_diff(a: f64, b: f64) -> f64 {
        let denom = a.abs().max(b.abs()).max(f64::MIN_POSITIVE);
        ((a - b).abs()) / denom
    }

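    /// Bit-distance between two finite f64s of the same sign, usable
    /// as an ulp count for same-sign comparands (for example,
    /// `ulp_diff(1.0, 1.0 + f64::EPSILON) == 1`).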
    fn ulp_diff(a: f64, b: f64) -> u64 {
        if a == b {
            return 0;
        }
        let abits = a.to_bits() as i128;
        let bbits = b.to_bits() as i128;
        (abits - bbits).unsigned_abs() as u64
    }

    fn sample_inputs() -> Vec<f64> {
        // Lengths cover lane-multiple, lane-multiple+tail, and a single
        // sub-lane chunk to exercise the partial path.
        let mut v = Vec::new();
        for k in [0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 17, 33, 64, 100, 1000] {
            for i in 0..k {
                v.push(((i as f64) / 7.0).sin() * 4.0 - 1.0);
            }
            v.push(f64::NAN); // size-boundary sentinel; stripped by the retain below
        }
        v.retain(|x| x.is_finite());
        v
    }

    #[test]
    fn add_inplace_matches_scalar() {
        let k = Kernels::new();
        let a0: Vec<f64> = (0..127).map(|i| (i as f64) * 0.31 - 5.0).collect();
        let b: Vec<f64> = (0..127).map(|i| (i as f64) * -0.17 + 0.5).collect();
        let mut a = a0.clone();
        k.add_inplace(&mut a, &b);
        for i in 0..a.len() {
            assert_eq!(
                a[i].to_bits(),
                (a0[i] + b[i]).to_bits(),
                "mismatch at i={i}"
            );
        }
    }

    #[test]
    fn sub_inplace_matches_scalar() {
        let k = Kernels::new();
        let a0: Vec<f64> = (0..127).map(|i| (i as f64) * 0.31 - 5.0).collect();
        let b: Vec<f64> = (0..127).map(|i| (i as f64) * -0.17 + 0.5).collect();
        let mut a = a0.clone();
        k.sub_inplace(&mut a, &b);
        for i in 0..a.len() {
            assert_eq!(
                a[i].to_bits(),
                (a0[i] - b[i]).to_bits(),
                "mismatch at i={i}"
            );
        }
    }

    #[test]
    fn scale_into_matches_scalar() {
        let k = Kernels::new();
        let src: Vec<f64> = (0..27).map(|i| (i as f64) * 0.31 - 5.0).collect();
        let mut out = vec![0.0; src.len()];
        let a = -2.5_f64;
        k.scale_into(&mut out, &src, a);
        for i in 0..src.len() {
            assert_eq!(
                out[i].to_bits(),
                (src[i] * a).to_bits(),
                "mismatch at i={i}"
            );
        }
    }

    #[test]
    fn min_clamp_exp_inplace_sum_matches_libm() {
        let k = Kernels::new();
        // Inputs span the negative tail (where exp is precise) and a
        // few values above the clamp to verify the min(t, clamp) gate.
        let inputs = sample_inputs();
        let log_c_clamp = 50.0_f64;
        let mut t = inputs.clone();
        let got_sum = k.min_clamp_exp_inplace_sum(&mut t, log_c_clamp);

        let mut want_sum = 0.0;
        for (i, &x) in inputs.iter().enumerate() {
            let want = x.min(log_c_clamp).exp();
            let ulps = ulp_diff(t[i], want);
            assert!(
                ulps <= 8,
                "value at i={i}: got {} want {} ({ulps} ulps)",
                t[i],
                want
            );
            want_sum += want;
        }
        let rel = rel_diff(got_sum, want_sum);
        assert!(
            rel < 1e-13,
            "sum rel diff {rel} (got {got_sum} want {want_sum})"
        );
    }

    #[test]
    fn min_clamp_exp_sum_matches_libm() {
        let k = Kernels::new();
        let inputs = sample_inputs();
        let log_c_clamp = 50.0_f64;
        let got_sum = k.min_clamp_exp_sum(&inputs, log_c_clamp);
        let want_sum: f64 = inputs.iter().map(|&x| x.min(log_c_clamp).exp()).sum();
        let rel = rel_diff(got_sum, want_sum);
        assert!(rel < 1e-13, "rel diff {rel}");
    }

    #[test]
    fn fused_lse_and_exp_clamp_matches_libm() {
        let k = Kernels::new();
        let inputs = sample_inputs();
        let log_c_clamp = 50.0_f64;
        let mut t = inputs.clone();
        let (t_max_got, sum_shifted_got) = k.fused_lse_and_exp_clamp(&mut t, log_c_clamp);

        let t_max_want = inputs.iter().copied().fold(f64::NEG_INFINITY, f64::max);
        assert_eq!(t_max_got, t_max_want, "t_max mismatch");

        let sum_shifted_want: f64 = inputs.iter().map(|&x| (x - t_max_want).exp()).sum();
        let rel = rel_diff(sum_shifted_got, sum_shifted_want);
        assert!(rel < 1e-13, "sum_shifted rel diff {rel}");

        for (i, &x) in inputs.iter().enumerate() {
            let want = x.min(log_c_clamp).exp();
            let ulps = ulp_diff(t[i], want);
            assert!(
                ulps <= 8,
                "c[i={i}]: got {} want {} ({ulps} ulps)",
                t[i],
                want
            );
        }
    }

    #[test]
    fn lse_sum_matches_libm() {
        let k = Kernels::new();
        let inputs = sample_inputs();
        let (t_max_got, sum_got) = k.lse_sum(&inputs);
        let t_max_want = inputs.iter().copied().fold(f64::NEG_INFINITY, f64::max);
        let sum_want: f64 = inputs.iter().map(|&x| (x - t_max_want).exp()).sum();
        assert_eq!(t_max_got, t_max_want);
        assert!(rel_diff(sum_got, sum_want) < 1e-13);
    }

    /// Confirm Neumaier compensation actually helps: build a sum of
    /// 54k values with the magnitude profile of the LSE shifted-exp on
    /// COFFEE testcase 0 (most values ~ 1e-90, a few ~ 1) and check
    /// the SIMD reduction against a scalar Kahan-Babuska-Neumaier
    /// reference.
    #[test]
    #[cfg(feature = "simd")]
    fn lse_sum_compensation_active() {
        let k = Kernels::new();
        // Construct shifted-exp values directly: 15 "near 1" values,
        // 54k "tiny" values. t inputs that produce these after the
        // `t - t_max` shift: for value ≈ 1, t ≈ 0; for value ≈ 1e-90,
        // t ≈ -207.
        let mut t = Vec::with_capacity(54218);
        for i in 0..54218 {
            // First 15 indices ~ 0 (so exp ≈ 1).
            t.push(if i < 15 {
                0.0
            } else {
                -207.0 + (i as f64) * 1e-6
            });
        }
        let (got_max, got_sum) = k.lse_sum(&t);
        assert_eq!(got_max, 0.0);

        // Reference: Kahan-Neumaier scalar of libm exp values.
        let mut ref_sum = 0.0_f64;
        let mut ref_comp = 0.0_f64;
        for &x in t.iter() {
            let e = (x - got_max).exp();
            let s = ref_sum + e;
            if ref_sum.abs() >= e.abs() {
                ref_comp += (ref_sum - s) + e;
            } else {
                ref_comp += (e - s) + ref_sum;
            }
            ref_sum = s;
        }
        let want_sum = ref_sum + ref_comp;

        let rel = rel_diff(got_sum, want_sum);
        assert!(
            rel < 1e-15,
            "SIMD compensated sum diverges from scalar KBN: got {got_sum} want {want_sum} (rel {rel:e})"
        );
    }

    /// Stress test mirroring the `evaluate_into` operating regime on
    /// large `n_species`: many values clamped at 700 (without the
    /// clamp their exp would overflow to +inf in libm, while our
    /// polynomial saturates at 2^1023), interleaved with mid- and
    /// small-magnitude values that span the full domain. This is the
    /// case that tripped up the COFFEE testcase 0 linear path.
    #[test]
    fn min_clamp_exp_inplace_sum_stress_large() {
        let k = Kernels::new();
        let log_c_clamp = 700.0;
        // 54k species, with a non-trivial spread: half clamped, the
        // other half with t in [-200, 200] picked to give exp values
        // at every magnitude scale.
        let mut t = Vec::with_capacity(54218);
        for i in 0..54218usize {
            let phase = (i as f64) * 0.123;
            let raw = if i % 2 == 0 {
                800.0 + (phase.sin() * 100.0)
            } else {
                ((phase * 1.7).cos()) * 200.0
            };
            t.push(raw);
        }
        let mut want_buf = t.clone();
        let want_sum: f64 = {
            for x in want_buf.iter_mut() {
                *x = x.min(log_c_clamp).exp();
            }
            want_buf.iter().sum()
        };
        let got_sum = k.min_clamp_exp_inplace_sum(&mut t, log_c_clamp);

        // Both paths can produce the same overflow-to-+inf sum; what we
        // require is that the kernel does not invent a different f64
        // class (e.g. swap +inf for finite-but-large, or NaN, or -inf).
        assert_eq!(
            want_sum.is_infinite(),
            got_sum.is_infinite(),
            "infinite-class mismatch: want {want_sum:e}, got {got_sum:e}"
        );
        assert_eq!(
            want_sum.is_sign_positive(),
            got_sum.is_sign_positive(),
            "sign mismatch: want {want_sum:e}, got {got_sum:e}"
        );
        if want_sum.is_finite() && got_sum.is_finite() {
            let rel = rel_diff(got_sum, want_sum);
            assert!(rel < 1e-12, "rel diff {rel:e}");
        }
        // Also: every per-element exp must agree to <= 4 ulps.
        for (i, (&got, &want)) in t.iter().zip(want_buf.iter()).enumerate() {
            let ulps = ulp_diff(got, want);
            assert!(
                ulps <= 4,
                "elem {i} (clamp {log_c_clamp}): got {got:e} want {want:e} ({ulps} ulps)"
            );
        }
    }

    /// Sweep `simd_exp` across the full operating-point range we feed
    /// it from `evaluate_into` and `evaluate_log_into` and assert it
    /// stays close to libm. This is the canary that protects against
    /// silent regressions in the polynomial / range reduction.
    #[test]
    #[cfg(feature = "simd")]
    fn simd_exp_accuracy_sweep() {
        let k = Kernels::new();
        let mut max_ulps = 0u64;
        let mut max_rel = 0.0_f64;
        let mut worst_x = 0.0_f64;
        // Range spans the full operating envelope:
        // - log-path inputs after `t - t_max` shift are in (-∞, 0].
        // - linear-path inputs are clamped to <= log_c_clamp = 700.
        // - solver intermediate iterates can hit anywhere in [-700, 700].
        //
        // The underflow tail down to -750 is the regression bait: for
        // `t < -log(2) * 1022 ≈ -708`, the true `exp(t)` underflows to
        // a denormal/zero, which is also what the polynomial+ldexp
        // path must produce. An earlier version of this kernel
        // mishandled negative `k + 1023` by sign-extending into
        // the f64 sign bit, producing a huge negative value instead of
        // zero; testcase 0 of the COFFEE bench reliably tripped it.
        let xs: Vec<f64> = (-7500..=7000).map(|i| (i as f64) / 10.0).collect();
        let mut t = xs.clone();
        // Use min_clamp_exp_inplace_sum to drive the SIMD exp; clamp
        // higher than max input so it's a no-op gate.
        let _ = k.min_clamp_exp_inplace_sum(&mut t, 1e9);
        for (&x, &got) in xs.iter().zip(t.iter()) {
            let want = x.exp();
            let ulps = ulp_diff(got, want);
            let rel = rel_diff(got, want);
            if ulps > max_ulps {
                max_ulps = ulps;
                worst_x = x;
            }
            if rel > max_rel {
                max_rel = rel;
            }
        }
        // 4 ulps would be acceptable for the trust-region ρ check; we
        // expect <= 2 with a degree-12 minimax. Set a generous gate
        // here so the canary fires only on real regressions.
        assert!(
            max_ulps <= 4,
            "max ulp error {max_ulps} (worst input x={worst_x}, max rel diff {max_rel:e})"
        );
    }

    /// LSE invariants we rely on in `evaluate_log_into`:
    /// 1. `lse = t_max + ln(sum_shifted)` reproduces `ln Σ exp(t)` to
    ///    machine precision (the whole point of the shift).
    /// 2. The result is identical between `fused_lse_and_exp_clamp`
    ///    and `lse_sum` on the same inputs (so candidate evaluator
    ///    and full evaluator agree on `f` for the trust-region ρ test).
    #[test]
    fn lse_self_consistency() {
        let k = Kernels::new();
        let inputs = sample_inputs();
        let log_c_clamp = 50.0_f64;
        let (t_max_a, sum_a) = k.lse_sum(&inputs);
        let mut buf = inputs.clone();
        let (t_max_b, sum_b) = k.fused_lse_and_exp_clamp(&mut buf, log_c_clamp);
        assert_eq!(t_max_a, t_max_b);
        assert!(rel_diff(sum_a, sum_b) < 1e-14);
    }
}