• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vigna / sux-rs / 24097483404

07 Apr 2026 06:22PM UTC coverage: 71.015% (-1.4%) from 72.384%
24097483404

push

github

vigna
Serialization for lists

6924 of 9750 relevant lines covered (71.02%)

82523075.84 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

23.3
/src/func/vfunc2.rs
1
/*
2
 * SPDX-FileCopyrightText: 2026 Sebastiano Vigna
3
 *
4
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
5
 */
6

7
#![allow(clippy::type_complexity)]
8

9
#[cfg(feature = "rayon")]
10
use core::error::Error;
11
use std::borrow::Borrow;
12

13
use super::shard_edge::FuseLge3Shards;
14
use crate::bits::BitFieldVec;
15
use crate::func::VFunc;
16
use crate::func::shard_edge::ShardEdge;
17
use crate::traits::Word;
18
use crate::utils::*;
19
use mem_dbg::*;
20
use num_primitive::{PrimitiveNumber, PrimitiveNumberAs};
21
use value_traits::slices::SliceByValue;
22

23
/// A two-step static function that stores frequent values in a narrow first
24
/// function and infrequent values in a wider second function, with
25
/// frequency-based remapping.
26
///
27
/// During construction, the value distribution is analyzed and the most frequent
28
/// values are assigned compact indices (0 . . 2*ʳ* − 1). A first ("short")
29
/// [`VFunc`] maps every key either to one of these compact indices or to the
30
/// escape sentinel 2*ʳ* − 1. For escaped keys a second ("long") [`VFunc`],
31
/// built only over the escaped subset, stores the full value. A small `remap`
32
/// table converts compact indices back to the original values.
33
///
34
/// The split point bit width of the short function *r* is chosen to minimize
35
/// the total estimated space.
36
///
37
/// If the distribution of output values is very skewed, this function will use
38
/// much less space than a [`VFunc`]. The impact on query time is limited to the
39
/// additional access to the long function; sometimes, the reduction in space
40
/// can even lead to faster queries due to better cache locality.
41
///
42
/// This structure implements the [`TryIntoUnaligned`] trait, allowing it to be
43
/// converted into (usually faster) structures using unaligned access.
44
///
45
/// # Generics
46
///
47
/// * `T`: The type of the keys.
48
/// * `W`: The word used to store the data, which is also the output type. It
49
///   can be any unsigned type. Defaults to `usize`.
50
/// * `D`: The backend storing the function data. Defaults to
51
///   `BitFieldVec<Box<[W]>>`.
52
/// * `S`: The signature type. The default is `[u64; 2]`.
53
/// * `E0`: The sharding and edge logic type for the short (frequent-value)
54
///   function. The default is [`FuseLge3Shards`].
55
/// * `E1`: The sharding and edge logic type for the long (escaped-value)
56
///   function. The default is `E0`.
57
///
58
/// # References
59
///
60
/// Djamal Belazzougui, Paolo Boldi, Rasmus Pagh, and Sebastiano Vigna. [Theory
61
/// and practice of monotone minimal perfect
62
/// hashing](https://doi.org/10.1145/1963190.2025378). *ACM Journal of
63
/// Experimental Algorithmics*, 16(3):3.2:1−3.2:26, 2011.
64
#[derive(MemDbg, MemSize)]
#[cfg_attr(feature = "epserde", derive(epserde::Epserde))]
#[cfg_attr(
    feature = "epserde",
    epserde(bound(
        deser = "D::Value: for<'a> epserde::deser::DeserInner<DeserType<'a> = D::Value>"
    ))
)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(
    feature = "serde",
    serde(bound(
        serialize = "D: serde::Serialize, D::Value: serde::Serialize, E0: serde::Serialize, E1: serde::Serialize",
        deserialize = "D: serde::Deserialize<'de>, D::Value: serde::Deserialize<'de>, E0: serde::Deserialize<'de>, E1: serde::Deserialize<'de>"
    ))
)]
pub struct VFunc2<T: ?Sized, D: SliceByValue, S = [u64; 2], E0 = FuseLge3Shards, E1 = E0> {
    /// First function: maps each key to a remapped index (*r* bits), or
    /// `escape` for infrequent values. When *r* = 0 this is an empty
    /// VFunc that always returns 0 = `escape`, so the long function is
    /// always queried.
    pub(crate) short: VFunc<T, D, S, E0>,
    /// Second function: maps escaped keys to their full value.
    pub(crate) long: VFunc<T, D, S, E1>,
    /// Maps remapped indices (0 . . `escape` − 1) back to actual values.
    /// Construction caps its length at min(`escape`, number of distinct
    /// values), so indexing with any non-escape short-function output is
    /// in bounds.
    pub(crate) remap: Box<[D::Value]>,
    /// The escape value (2*ʳ* − 1). When *r* = 0, `escape` = 0 and the
    /// short function always returns the escape.
    pub(crate) escape: D::Value,
}
94

95
impl<T: ?Sized, D: SliceByValue, S, E0, E1> std::fmt::Debug for VFunc2<T, D, S, E0, E1>
96
where
97
    D::Value: std::fmt::Debug,
98
    VFunc<T, D, S, E0>: std::fmt::Debug,
99
    VFunc<T, D, S, E1>: std::fmt::Debug,
100
{
101
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
×
102
        f.debug_struct("VFunc2")
×
103
            .field("short", &self.short)
×
104
            .field("long", &self.long)
×
105
            .field("remap", &self.remap)
×
106
            .field("escape", &self.escape)
×
107
            .finish()
108
    }
109
}
110

111
impl<T: ?Sized, W: Word, S: Sig, E0: ShardEdge<S, 3>, E1: ShardEdge<S, 3>>
112
    VFunc2<T, BitFieldVec<Box<[W]>>, S, E0, E1>
113
{
114
    /// Creates a VFunc2 with zero keys.
115
    ///
116
    /// With `escape = 0`, the short function always returns the escape,
117
    /// so `get` always queries the long function (which returns zero
118
    /// since both internal [`VFunc`]s are empty).
119
    pub fn empty() -> Self {
×
120
        Self {
121
            short: VFunc::empty(),
×
122
            long: VFunc::empty(),
×
123
            remap: Box::new([]),
×
124
            escape: W::ZERO,
125
        }
126
    }
127
}
128

129
impl<
130
    T: ?Sized + ToSig<S>,
131
    D: SliceByValue<Value: Word + BinSafe + PrimitiveNumberAs<usize>>,
132
    S: Sig,
133
    E0: ShardEdge<S, 3>,
134
    E1: ShardEdge<S, 3>,
135
> VFunc2<T, D, S, E0, E1>
136
{
137
    /// Retrieves the value for a key given its pre-computed signature.
138
    ///
139
    /// The signature must have been computed with `self.short.seed`
140
    /// (i.e., `T::to_sig(key, self.short.seed)`). Both the short and
141
    /// long internal functions share the same seed.
142
    ///
143
    /// This method is mainly useful in the construction of compound
144
    /// functions.
145
    #[inline]
146
    pub fn get_by_sig(&self, sig: S) -> D::Value {
×
147
        let idx = self.short.get_by_sig(sig);
×
148
        if idx != self.escape {
×
149
            self.remap[idx.as_to::<usize>()]
×
150
        } else {
151
            self.long.get_by_sig(sig)
×
152
        }
153
    }
154

155
    /// Retrieves the value associated with the given key, or an arbitrary
156
    /// value if the key was not in the original set.
157
    #[inline(always)]
158
    pub fn get(&self, key: impl Borrow<T>) -> D::Value {
×
159
        self.get_by_sig(T::to_sig(key.borrow(), self.short.seed))
×
160
    }
161

162
    /// Returns the number of keys in the function.
163
    pub fn len(&self) -> usize {
×
164
        self.short.num_keys
×
165
    }
166

167
    /// Returns whether the function has no keys.
168
    pub fn is_empty(&self) -> bool {
×
169
        self.short.num_keys == 0
×
170
    }
171
}
172

173
// ── Aligned ↔ Unaligned conversions ─────────────────────────────────
174

175
use crate::bits::BitFieldVecU;
176
use crate::traits::TryIntoUnaligned;
177

178
impl<T: ?Sized, W: Word, S: Sig, E0: ShardEdge<S, 3>, E1: ShardEdge<S, 3>> TryIntoUnaligned
179
    for VFunc2<T, BitFieldVec<Box<[W]>>, S, E0, E1>
180
{
181
    type Unaligned = VFunc2<T, BitFieldVecU<Box<[W]>>, S, E0, E1>;
182
    fn try_into_unaligned(
×
183
        self,
184
    ) -> Result<Self::Unaligned, crate::traits::UnalignedConversionError> {
185
        Ok(VFunc2 {
×
186
            short: self.short.try_into_unaligned()?,
×
187
            long: self.long.try_into_unaligned()?,
×
188
            remap: self.remap,
×
189
            escape: self.escape,
×
190
        })
191
    }
192
}
193

194
impl<T: ?Sized, W: Word, S: Sig, E0: ShardEdge<S, 3>, E1: ShardEdge<S, 3>>
195
    From<VFunc2<T, BitFieldVecU<Box<[W]>>, S, E0, E1>>
196
    for VFunc2<T, BitFieldVec<Box<[W]>>, S, E0, E1>
197
{
198
    /// Converts a [`VFunc2`] with [`BitFieldVecU`] data back into
199
    /// one with [`BitFieldVec`] data.
200
    fn from(vf: VFunc2<T, BitFieldVecU<Box<[W]>>, S, E0, E1>) -> Self {
×
201
        VFunc2 {
202
            short: VFunc::from(vf.short),
×
203
            long: VFunc::from(vf.long),
×
204
            remap: vf.remap,
×
205
            escape: vf.escape,
×
206
        }
207
    }
208
}
209

210
#[cfg(feature = "rayon")]
/// A map from `K` to `V` backed by a flat array for small keys and a
/// [`HashMap`] for large keys.
pub(crate) struct HybridMap<K, V> {
    /// Dense storage, indexed directly by keys smaller than `array.len()`.
    array: Vec<V>,
    /// Sparse storage for keys at or beyond `array.len()`.
    map: std::collections::HashMap<K, V>,
    /// Value returned by `get` for keys present in neither storage.
    default: V,
}
218

219
#[cfg(feature = "rayon")]
impl<K: Word + PrimitiveNumberAs<usize>, V: Copy + Eq> HybridMap<K, V> {
    /// Creates a new hybrid map.
    ///
    /// * `max_key` — optional upper bound on keys. When provided,
    ///   the array is capped at `max_key + 1`.
    /// * `default` — value returned for absent keys.
    pub(crate) fn new(max_key: Option<K>, default: V) -> Self {
        // At most 2^10 small keys live in the flat array; anything
        // larger spills into the hash map.
        let mut array_len = 1 << 10;
        if let Some(mk) = max_key {
            // saturating_add: if mk converts to usize::MAX, `+ 1` would
            // wrap to 0 and produce an (incorrect) empty array.
            array_len = array_len.min(mk.as_to::<usize>().saturating_add(1));
        }
        Self {
            array: vec![default; array_len],
            map: std::collections::HashMap::new(),
            default,
        }
    }

    /// Inserts `value` for `key`, overwriting any previous value.
    pub(crate) fn insert(&mut self, key: K, value: V) {
        let k: usize = key.as_to();
        if k < self.array.len() {
            self.array[k] = value;
        } else {
            self.map.insert(key, value);
        }
    }

    /// Returns the value for `key`, or the default if `key` is absent.
    #[inline(always)]
    pub(crate) fn get(&self, key: K) -> V {
        let k: usize = key.as_to();
        if k < self.array.len() {
            self.array[k]
        } else {
            self.map.get(&key).copied().unwrap_or(self.default)
        }
    }

    /// Returns keys whose value differs from the default, sorted by
    /// descending value (requires `V: Ord`).
    pub(crate) fn keys_by_desc_value(&self) -> Vec<K>
    where
        V: Ord,
    {
        let array_iter = self
            .array
            .iter()
            .enumerate()
            .filter(|&(_, v)| *v != self.default)
            .map(|(k, _)| K::try_from(k).ok().unwrap());
        let map_iter = self.map.keys().copied();
        let mut keys: Vec<K> = array_iter.chain(map_iter).collect();
        // sort_by_cached_key computes each key's value exactly once,
        // whereas sort_by_key would call get() (which may probe the
        // hash map) on every comparison. Both sorts are stable, so the
        // resulting order is identical.
        keys.sort_by_cached_key(|b| std::cmp::Reverse(self.get(*b)));
        keys
    }
}
275

276
#[cfg(feature = "rayon")]
impl<K: Word + PrimitiveNumberAs<usize>> HybridMap<K, usize> {
    /// Increments the counter for `key` by one.
    #[inline(always)]
    pub(crate) fn incr(&mut self, key: K) {
        self.add(key, 1);
    }

    /// Adds `amount` to the counter for `key`.
    #[inline(always)]
    pub(crate) fn add(&mut self, key: K, amount: usize) {
        let idx: usize = key.as_to();
        // Small keys are counted in the flat array; the rest go through
        // the hash map entry API.
        match self.array.get_mut(idx) {
            Some(slot) => *slot += amount,
            None => *self.map.entry(key).or_default() += amount,
        }
    }
}
293

294
/// Finds the optimal short-function bit width `r` for a [`VFunc2`],
/// minimizing the estimated total space.
///
/// `sorted_vals` must be the distinct values sorted by descending
/// frequency. `count_of` returns the frequency for a given value.
#[cfg(feature = "rayon")]
pub(crate) fn find_optimal_r<W: Word>(
    n: usize,
    max_value: W,
    sorted_vals: &[W],
    count_of: impl Fn(W) -> usize,
    w_bits: usize,
) -> usize {
    let w = max_value.bit_len() as usize;
    let num_distinct = sorted_vals.len();
    // VFunc expansion factor (approximate).
    const C: f64 = 1.11;

    // Keys still escaping to the long function at the current r.
    let mut remaining = n;
    // Distinct values already absorbed into the remap table.
    let mut absorbed = 0usize;
    // (cost, r) of the best candidate seen so far.
    let mut best = (f64::MAX, 0usize);

    // r < w <= 64, so 1usize << r never overflows.
    for r in 0..w {
        // Estimated bits for the short function (absent when r = 0),
        // the long function over escaped keys, and the remap table.
        let cost_short = if r == 0 { 0.0 } else { C * n as f64 * r as f64 };
        let cost_long = C * remaining as f64 * w as f64;
        let cost_remap = absorbed as f64 * w_bits as f64;
        let total = cost_short + cost_long + cost_remap;

        if total < best.0 {
            best = (total, r);
        }

        // Absorb the next most frequent values: a width of r remaps up
        // to 2^r values (one index is reserved as the escape at query
        // time, matching the original accounting).
        let take = (1usize << r).min(num_distinct - absorbed);
        for &v in &sorted_vals[absorbed..absorbed + take] {
            remaining -= count_of(v);
        }
        absorbed += take;
    }

    best.1
}
337

338
#[cfg(feature = "rayon")]
339
use {
340
    crate::func::VBuilder,
341
    dsi_progress_logger::ProgressLog,
342
    lender::*,
343
    rdst::RadixKey,
344
    std::ops::{BitXor, BitXorAssign},
345
    sync_cell_slice::SyncSlice,
346
};
347

348
#[cfg(feature = "rayon")]
impl<T, W, S, E0, E1> VFunc2<T, BitFieldVec<Box<[W]>>, S, E0, E1>
where
    T: ?Sized + ToSig<S> + std::fmt::Debug,
    W: Word + BinSafe + MemSize + mem_dbg::FlatType,
    S: Sig + Send + Sync,
    E0: ShardEdge<S, 3> + MemSize + mem_dbg::FlatType,
    E1: ShardEdge<S, 3> + MemSize + mem_dbg::FlatType,
    Box<[W]>: MemSize,
    SigVal<S, W>: RadixKey,
    SigVal<E0::LocalSig, W>: BitXor + BitXorAssign,
    SigVal<E1::LocalSig, W>: BitXor + BitXorAssign,
{
    /// Builds a [`VFunc2`] from keys and values using default
    /// [`VBuilder`] settings.
    ///
    /// * `keys` and `values` must be aligned (one value per key, same
    ///   order) and rewindable (they may be rewound on retry).
    /// * `n` is the expected number of keys; a significantly wrong
    ///   value may degrade performance or cause extra retries.
    ///
    /// This is a convenience wrapper around
    /// [`try_new_with_builder`](Self::try_new_with_builder) with
    /// `VBuilder::default()`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # #[cfg(feature = "rayon")]
    /// # fn main() -> anyhow::Result<()> {
    /// # use sux::func::VFunc2;
    /// # use sux::bits::BitFieldVec;
    /// # use sux::traits::TryIntoUnaligned;
    /// # use dsi_progress_logger::no_logging;
    /// # use sux::utils::FromCloneableIntoIterator;
    /// let func = <VFunc2<usize, BitFieldVec<Box<[usize]>>>>::try_new(
    ///     FromCloneableIntoIterator::new(0..100_usize),
    ///     FromCloneableIntoIterator::new(0..100_usize),
    ///     100,
    ///     no_logging![],
    /// )?.try_into_unaligned()?;
    ///
    /// for i in 0..100 {
    ///     assert_eq!(func.get(&i), i);
    /// }
    /// # Ok(())
    /// # }
    /// # #[cfg(not(feature = "rayon"))]
    /// # fn main() {}
    /// ```
    pub fn try_new<B: ?Sized + std::borrow::Borrow<T>>(
        keys: impl FallibleRewindableLender<
            RewindError: Error + Send + Sync + 'static,
            Error: Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend B>,
        values: impl FallibleRewindableLender<
            RewindError: Error + Send + Sync + 'static,
            Error: Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend W>,
        n: usize,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> anyhow::Result<Self> {
        Self::try_new_with_builder(keys, values, n, VBuilder::default(), pl)
    }

    /// Builds a [`VFunc2`] from keys and values using the given
    /// [`VBuilder`] configuration.
    ///
    /// * `keys` and `values` must be aligned (one value per key, same
    ///   order) and rewindable (they may be rewound on retry).
    /// * `n` is the expected number of keys.
    ///
    /// The builder controls construction parameters such as [offline
    /// mode](VBuilder::offline), [thread count](VBuilder::max_num_threads),
    /// [sharding overhead](VBuilder::eps), and [PRNG seed](VBuilder::seed).
    ///
    /// # Examples
    ///
    /// ```rust
    /// # #[cfg(feature = "rayon")]
    /// # fn main() -> anyhow::Result<()> {
    /// # use sux::func::{VBuilder, VFunc2};
    /// # use sux::bits::BitFieldVec;
    /// # use sux::traits::TryIntoUnaligned;
    /// # use dsi_progress_logger::no_logging;
    /// # use sux::utils::FromCloneableIntoIterator;
    /// let func = <VFunc2<usize, BitFieldVec<Box<[usize]>>>>::try_new_with_builder(
    ///     FromCloneableIntoIterator::new(0..100_usize),
    ///     FromCloneableIntoIterator::new(0..100_usize),
    ///     100,
    ///     VBuilder::default().offline(true),
    ///     no_logging![],
    /// )?.try_into_unaligned()?;
    ///
    /// for i in 0..100 {
    ///     assert_eq!(func.get(&i), i);
    /// }
    /// # Ok(())
    /// # }
    /// # #[cfg(not(feature = "rayon"))]
    /// # fn main() {}
    /// ```
    pub fn try_new_with_builder<B: ?Sized + std::borrow::Borrow<T>>(
        keys: impl FallibleRewindableLender<
            RewindError: Error + Send + Sync + 'static,
            Error: Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend B>,
        values: impl FallibleRewindableLender<
            RewindError: Error + Send + Sync + 'static,
            Error: Error + Send + Sync + 'static,
        > + for<'lend> FallibleLending<'lend, Lend = &'lend W>,
        n: usize,
        builder: VBuilder<BitFieldVec<Box<[W]>>, S, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> anyhow::Result<Self> {
        let mut builder = builder.expected_num_keys(n);
        builder
            .try_populate_and_build(
                keys,
                values,
                // The callback receives the populated shard store and
                // delegates the two-step build; the user's builder
                // settings are copied onto a fresh inner builder first.
                &mut |builder, seed, mut store, _max_value, _num_keys, pl, _state: &mut ()| {
                    let mut inner_builder = VBuilder::default()
                        .max_num_threads(builder.max_num_threads)
                        .offline(builder.offline);
                    inner_builder.eps = builder.eps;
                    if let Some(low_mem) = builder.low_mem {
                        inner_builder = inner_builder.low_mem(low_mem);
                    }
                    // Identity value extractor: the store holds the
                    // values themselves.
                    Self::try_build_from_store::<W>(
                        seed,
                        builder.shard_edge,
                        &mut *store,
                        &|v| v,
                        inner_builder,
                        pl,
                    )
                },
                pl,
                (),
            )
            .map(|(r, _keys)| r)
    }

    /// Builds a [`VFunc2`] from an existing [`ShardStore`].
    ///
    /// This is the low-level constructor used when multiple VFuncs are
    /// built from the same store (e.g., inside
    /// [`Lcp2Mmphf`](crate::func::Lcp2Mmphf)). Use [`try_new`](Self::try_new)
    /// or [`try_new_with_builder`](Self::try_new_with_builder) for the
    /// common case of building from keys and values directly.
    ///
    /// # Preconditions
    ///
    /// * `seed` and `shard_edge` must be the values used when the store
    ///   was populated.
    /// * `get_val` must be deterministic: the store is iterated multiple
    ///   times (frequency analysis, then short/long construction) and
    ///   differing results corrupt the function silently.
    ///
    /// # Arguments
    ///
    /// * `seed` — the seed from the store's population step.
    /// * `shard_edge` — the shard edge from the same population step.
    /// * `store` — the populated shard store.
    /// * `get_val` — extracts the value from the store's packed entry
    ///   (e.g., `|v| v >> log2_bs` for LCP lengths).
    /// * `builder` — the builder configuration for the internal VFuncs.
    /// * `pl` — a progress logger.
    pub fn try_build_from_store<V: BinSafe + Default + Send + Sync + Copy>(
        seed: u64,
        shard_edge: E0,
        store: &mut (impl ShardStore<S, V> + ?Sized),
        get_val: &(impl Fn(V) -> W + Send + Sync),
        builder: VBuilder<BitFieldVec<Box<[W]>>, S, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> anyhow::Result<Self>
    where
        SigVal<S, V>: RadixKey,
        SigVal<E0::LocalSig, V>: BitXor + BitXorAssign,
        SigVal<E1::LocalSig, V>: BitXor + BitXorAssign,
    {
        // -- Frequency analysis (single pass) --

        // Count occurrences of each distinct value and track the
        // maximum, which determines the long function's bit width.
        let mut max_value = W::ZERO;
        let mut counts: HybridMap<W, usize> = HybridMap::new(None, 0);
        for shard in store.iter() {
            for sv in shard.iter() {
                let val = get_val(sv.val);
                if val > max_value {
                    max_value = val;
                }
                counts.incr(val);
            }
        }

        Self::build_from_hybrid_counts(
            seed, shard_edge, store, get_val, max_value, counts, builder, pl,
        )
    }

    /// Core two-step build logic using [`HybridMap`] for O(1) lookups
    /// on common value ranges.
    fn build_from_hybrid_counts<V: BinSafe + Default + Send + Sync + Copy>(
        seed: u64,
        shard_edge: E0,
        store: &mut (impl ShardStore<S, V> + ?Sized),
        get_val: &(impl Fn(V) -> W + Send + Sync),
        max_value: W,
        counts: HybridMap<W, usize>,
        builder: VBuilder<BitFieldVec<Box<[W]>>, S, E0>,
        pl: &mut (impl ProgressLog + Clone + Send + Sync),
    ) -> anyhow::Result<Self>
    where
        SigVal<S, V>: RadixKey,
        SigVal<E0::LocalSig, V>: BitXor + BitXorAssign,
        SigVal<E1::LocalSig, V>: BitXor + BitXorAssign,
    {
        // -- Sort distinct values by descending frequency --

        let sorted_vals: Vec<W> = counts.keys_by_desc_value();

        let w = max_value.bit_len() as usize;
        let m = sorted_vals.len();

        // -- Find optimal r --

        let n = store.len();
        let best_r = find_optimal_r(
            n,
            max_value,
            &sorted_vals,
            |v| counts.get(v),
            W::BITS as usize,
        );

        // best_r < w <= W::BITS (find_optimal_r iterates r in 0..w),
        // so 2^best_r - 1 is representable in W and try_from succeeds.
        let escape_usize = (1usize << best_r).wrapping_sub(1); // 2^r - 1
        let escape = W::try_from(escape_usize).ok().unwrap();
        let num_remapped = escape_usize.min(m);

        // -- Build remap and inv_map --

        // remap: compact index -> value; inv_map: value -> compact
        // index, defaulting to `escape` for non-remapped values.
        let remap: Box<[W]> = sorted_vals[..num_remapped].to_vec().into_boxed_slice();
        let mut inv_map: HybridMap<W, W> = HybridMap::new(Some(max_value), escape);
        for (i, &val) in remap.iter().enumerate() {
            inv_map.insert(val, W::try_from(i).ok().unwrap());
        }

        pl.info(format_args!(
            "Two-step: r={best_r}, escape={escape_usize}, {num_remapped} remapped values, \
             {m} distinct values, max_value={max_value} ({w} bits)",
        ));

        // -- Build short VFunc --
        // When r = 0, escape = 0 and the short function maps every key to
        // 0 = escape, so the long function is always queried.

        // Set up per-shard counters for escaped entries, using the
        // store's current shard granularity.
        let shb = shard_edge.shard_high_bits();
        let num_shards = 1usize << shb;
        let shard_mask = (1u64 << shb) - 1;
        let mut escaped_counts = vec![0usize; num_shards];
        let sync_counts = escaped_counts.as_sync_slice();

        // Save builder settings before the short VFunc consumes it.
        let saved_max_num_threads = builder.max_num_threads;
        let saved_eps = builder.eps;
        let saved_offline = builder.offline;
        let saved_low_mem = builder.low_mem;

        pl.info(format_args!(
            "Building key -> remapped index ({best_r} bits, escape={escape_usize})..."
        ));
        // The inspect closure piggybacks on the build pass to count
        // escaped entries per shard, avoiding an extra store iteration.
        let short = builder.try_build_func_with_store_and_inspect::<T, V>(
            seed,
            shard_edge,
            escape,
            store,
            &|_e, sig_val| inv_map.get(get_val(sig_val.val)),
            &|sv: &SigVal<S, V>| {
                if inv_map.get(get_val(sv.val)) == escape {
                    let shard_idx = sv.sig.high_bits(shb, shard_mask) as usize;
                    // SAFETY: each shard is processed by exactly one
                    // thread, so no two threads access the same counter.
                    unsafe {
                        let c = sync_counts[shard_idx].get();
                        sync_counts[shard_idx].set(c + 1);
                    }
                }
            },
            pl,
        )?;

        // escaped_counts now has per-shard escaped entry counts.

        // -- Build long VFunc (escaped keys only) --

        // Total escaped keys = all keys minus those whose value was
        // remapped; cross-checked below against the inspect counters.
        let n_escaped = n - sorted_vals[..num_remapped]
            .iter()
            .map(|&v| counts.get(v))
            .sum::<usize>();

        debug_assert_eq!(
            escaped_counts.iter().sum::<usize>(),
            n_escaped,
            "inspect-counted escaped != freq-computed escaped"
        );

        let mut long_shard_edge = E1::default();
        long_shard_edge.set_up_shards(n_escaped, saved_eps);
        let long_shard_high_bits = long_shard_edge.shard_high_bits();

        // Aggregate escaped_counts to the long function's shard granularity.
        let long_num_shards = 1usize << long_shard_high_bits;
        let filtered_shard_sizes = if long_num_shards >= num_shards {
            escaped_counts
        } else {
            let shards_per_long = num_shards / long_num_shards;
            escaped_counts
                .chunks(shards_per_long)
                .map(|chunk| chunk.iter().sum())
                .collect()
        };

        pl.info(format_args!(
            "Building key -> full value ({w} bits, {n_escaped} escaped keys, {:.1}%)...",
            100.0 * n_escaped as f64 / n as f64
        ));

        // View of the store restricted to entries whose value escapes
        // (i.e., maps to `escape` in inv_map).
        let mut filtered_store = FilteredShardStore::new(
            store,
            long_shard_high_bits,
            |sv: &SigVal<S, V>| inv_map.get(get_val(sv.val)) == escape,
            filtered_shard_sizes,
        );
        // Re-create the builder for the long function from the settings
        // saved before the short build consumed the original builder.
        let mut long_builder = VBuilder::<BitFieldVec<Box<[W]>>, S, E1>::default()
            .max_num_threads(saved_max_num_threads)
            .offline(saved_offline);
        long_builder.eps = saved_eps;
        if let Some(low_mem) = saved_low_mem {
            long_builder = long_builder.low_mem(low_mem);
        }
        let long = long_builder.try_build_func_with_store::<T, V>(
            seed,
            long_shard_edge,
            max_value,
            &mut filtered_store,
            &|_e, sig_val| get_val(sig_val.val),
            pl,
        )?;

        let result = Self {
            short,
            long,
            remap,
            escape,
        };
        let n = store.len();
        let total = result.mem_size(SizeFlags::default()) * 8;
        pl.info(format_args!(
            "Bits/keys: {:.2} ({total} bits for {n} keys)",
            total as f64 / n as f64,
        ));
        Ok(result)
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc