• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16935267080

13 Aug 2025 11:00AM UTC coverage: 24.312% (-63.3%) from 87.658%
16935267080

Pull #4226

github

web-flow
Merge 81b48c7fb into baa6ea202
Pull Request #4226: Support converting TimestampTZ to and from duckdb

0 of 2 new or added lines in 1 file covered. (0.0%)

20666 existing lines in 469 files now uncovered.

8726 of 35892 relevant lines covered (24.31%)

147.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

61.67
/vortex-btrblocks/src/integer/stats.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
use std::hash::Hash;
5

6
use arrow_buffer::BooleanBuffer;
7
use num_traits::PrimInt;
8
use rustc_hash::FxBuildHasher;
9
use vortex_array::ToCanonical;
10
use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
11
use vortex_array::stats::Stat;
12
use vortex_dtype::{NativePType, match_each_integer_ptype};
13
use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
14
use vortex_mask::AllOr;
15
use vortex_scalar::{PValue, Scalar};
16
use vortex_utils::aliases::hash_map::HashMap;
17

18
use crate::sample::sample;
19
use crate::{CompressorStats, GenerateStatsOptions};
20

21
#[derive(Clone, Debug)]
22
pub struct TypedStats<T> {
23
    pub min: T,
24
    pub max: T,
25
    pub top_value: T,
26
    pub top_count: u32,
27
    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
28
}
29

30
/// Type-erased container for one of the [TypedStats] variants.
31
///
32
/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
33
/// set of stats. We then perform a variety of access methods on them.
34
#[derive(Clone, Debug)]
35
pub enum ErasedStats {
36
    U8(TypedStats<u8>),
37
    U16(TypedStats<u16>),
38
    U32(TypedStats<u32>),
39
    U64(TypedStats<u64>),
40
    I8(TypedStats<i8>),
41
    I16(TypedStats<i16>),
42
    I32(TypedStats<i32>),
43
    I64(TypedStats<i64>),
44
}
45

46
impl ErasedStats {
47
    pub fn min_is_zero(&self) -> bool {
30✔
48
        match &self {
30✔
UNCOV
49
            ErasedStats::U8(x) => x.min == 0,
×
UNCOV
50
            ErasedStats::U16(x) => x.min == 0,
×
UNCOV
51
            ErasedStats::U32(x) => x.min == 0,
×
52
            ErasedStats::U64(x) => x.min == 0,
6✔
UNCOV
53
            ErasedStats::I8(x) => x.min == 0,
×
UNCOV
54
            ErasedStats::I16(x) => x.min == 0,
×
55
            ErasedStats::I32(x) => x.min == 0,
20✔
56
            ErasedStats::I64(x) => x.min == 0,
4✔
57
        }
58
    }
30✔
59

60
    pub fn min_is_negative(&self) -> bool {
60✔
61
        match &self {
60✔
62
            ErasedStats::U8(_)
63
            | ErasedStats::U16(_)
64
            | ErasedStats::U32(_)
65
            | ErasedStats::U64(_) => false,
12✔
UNCOV
66
            ErasedStats::I8(x) => x.min < 0,
×
UNCOV
67
            ErasedStats::I16(x) => x.min < 0,
×
68
            ErasedStats::I32(x) => x.min < 0,
40✔
69
            ErasedStats::I64(x) => x.min < 0,
8✔
70
        }
71
    }
60✔
72

73
    // Difference between max and min.
74
    pub fn max_minus_min(&self) -> u64 {
12✔
75
        match &self {
12✔
UNCOV
76
            ErasedStats::U8(x) => (x.max - x.min) as u64,
×
UNCOV
77
            ErasedStats::U16(x) => (x.max - x.min) as u64,
×
UNCOV
78
            ErasedStats::U32(x) => (x.max - x.min) as u64,
×
UNCOV
79
            ErasedStats::U64(x) => x.max - x.min,
×
UNCOV
80
            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
×
UNCOV
81
            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
×
82
            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
8✔
83
            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
4✔
84
                .vortex_expect("max minus min result bigger than u64"),
4✔
85
        }
86
    }
12✔
87

88
    /// Get the most commonly occurring value and its count
89
    pub fn top_value_and_count(&self) -> (PValue, u32) {
30✔
90
        match &self {
30✔
UNCOV
91
            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
×
UNCOV
92
            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
×
UNCOV
93
            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
×
94
            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
6✔
UNCOV
95
            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
×
UNCOV
96
            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
×
97
            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
20✔
98
            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
4✔
99
        }
100
    }
30✔
101
}
102

103
macro_rules! impl_from_typed {
104
    ($T:ty, $variant:path) => {
105
        impl From<TypedStats<$T>> for ErasedStats {
106
            fn from(typed: TypedStats<$T>) -> Self {
54✔
107
                $variant(typed)
54✔
108
            }
54✔
109
        }
110
    };
111
}
112

113
impl_from_typed!(u8, ErasedStats::U8);
114
impl_from_typed!(u16, ErasedStats::U16);
115
impl_from_typed!(u32, ErasedStats::U32);
116
impl_from_typed!(u64, ErasedStats::U64);
117
impl_from_typed!(i8, ErasedStats::I8);
118
impl_from_typed!(i16, ErasedStats::I16);
119
impl_from_typed!(i32, ErasedStats::I32);
120
impl_from_typed!(i64, ErasedStats::I64);
121

122
#[derive(Clone, Debug)]
123
pub struct IntegerStats {
124
    pub(super) src: PrimitiveArray,
125
    // cache for validity.false_count()
126
    pub(super) null_count: u32,
127
    // cache for validity.true_count()
128
    pub(super) value_count: u32,
129
    pub(super) average_run_length: u32,
130
    pub(super) distinct_values_count: u32,
131
    pub(crate) typed: ErasedStats,
132
}
133

134
impl CompressorStats for IntegerStats {
135
    type ArrayVTable = PrimitiveVTable;
136

137
    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
54✔
138
        match_each_integer_ptype!(input.ptype(), |T| {
54✔
UNCOV
139
            typed_int_stats::<T>(input, opts.count_distinct_values)
×
140
        })
141
    }
54✔
142

143
    fn source(&self) -> &PrimitiveArray {
220✔
144
        &self.src
220✔
145
    }
220✔
146

147
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
20✔
148
        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
20✔
149
            .to_primitive()
20✔
150
            .vortex_expect("primitive");
20✔
151

152
        Self::generate_opts(&sampled, opts)
20✔
153
    }
20✔
154
}
155

156
fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
54✔
157
where
54✔
158
    T: NativePType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
54✔
159
    TypedStats<T>: Into<ErasedStats>,
54✔
160
    NativeValue<T>: Eq + Hash,
54✔
161
{
162
    // Special case: empty array
163
    if array.is_empty() {
54✔
164
        return IntegerStats {
×
165
            src: array.clone(),
×
166
            null_count: 0,
×
167
            value_count: 0,
×
168
            average_run_length: 0,
×
169
            distinct_values_count: 0,
×
170
            typed: TypedStats {
×
171
                min: T::max_value(),
×
172
                max: T::min_value(),
×
173
                top_value: T::default(),
×
174
                top_count: 0,
×
175
                distinct_values: HashMap::with_hasher(FxBuildHasher),
×
176
            }
×
177
            .into(),
×
178
        };
×
179
    } else if array.all_invalid().vortex_expect("all_invalid") {
54✔
180
        return IntegerStats {
×
181
            src: array.clone(),
×
182
            null_count: array.len().try_into().vortex_expect("null_count"),
×
183
            value_count: 0,
×
184
            average_run_length: 0,
×
185
            distinct_values_count: 0,
×
186
            typed: TypedStats {
×
187
                min: T::max_value(),
×
188
                max: T::min_value(),
×
189
                top_value: T::default(),
×
190
                top_count: 0,
×
191
                distinct_values: HashMap::with_hasher(FxBuildHasher),
×
192
            }
×
193
            .into(),
×
194
        };
×
195
    }
54✔
196

197
    let validity = array.validity_mask().vortex_expect("logical_validity");
54✔
198
    let null_count = validity.false_count();
54✔
199
    let value_count = validity.true_count();
54✔
200

201
    // Initialize loop state
202
    let head_idx = validity
54✔
203
        .first()
54✔
204
        .vortex_expect("All null masks have been handled before");
54✔
205
    let buffer = array.buffer::<T>();
54✔
206
    let head = buffer[head_idx];
54✔
207

208
    let mut loop_state = LoopState {
54✔
209
        distinct_values: if count_distinct_values {
54✔
210
            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
50✔
211
        } else {
212
            HashMap::with_hasher(FxBuildHasher)
4✔
213
        },
214
        prev: head,
54✔
215
        runs: 1,
216
    };
217

218
    let sliced = buffer.slice(head_idx..array.len());
54✔
219
    let mut chunks = sliced.as_slice().chunks_exact(64);
54✔
220
    match validity.boolean_buffer() {
54✔
221
        AllOr::All => {
222
            for chunk in &mut chunks {
582✔
223
                inner_loop_nonnull(
528✔
224
                    chunk.try_into().vortex_unwrap(),
528✔
225
                    count_distinct_values,
528✔
226
                    &mut loop_state,
528✔
227
                )
228
            }
229
            let remainder = chunks.remainder();
54✔
230
            inner_loop_naive(
54✔
231
                remainder,
54✔
232
                count_distinct_values,
54✔
233
                &BooleanBuffer::new_set(remainder.len()),
54✔
234
                &mut loop_state,
54✔
235
            );
236
        }
237
        AllOr::None => unreachable!("All invalid arrays have been handled before"),
×
UNCOV
238
        AllOr::Some(v) => {
×
UNCOV
239
            let mask = v.slice(head_idx, array.len() - head_idx);
×
UNCOV
240
            let mut offset = 0;
×
UNCOV
241
            for chunk in &mut chunks {
×
UNCOV
242
                let validity = mask.slice(offset, 64);
×
UNCOV
243
                offset += 64;
×
244

UNCOV
245
                match validity.count_set_bits() {
×
246
                    // All nulls -> no stats to update
247
                    0 => continue,
×
248
                    // Inner loop for when validity check can be elided
249
                    64 => inner_loop_nonnull(
×
250
                        chunk.try_into().vortex_unwrap(),
×
251
                        count_distinct_values,
×
252
                        &mut loop_state,
×
253
                    ),
254
                    // Inner loop for when we need to check validity
UNCOV
255
                    _ => inner_loop_nullable(
×
UNCOV
256
                        chunk.try_into().vortex_unwrap(),
×
UNCOV
257
                        count_distinct_values,
×
UNCOV
258
                        &validity,
×
UNCOV
259
                        &mut loop_state,
×
260
                    ),
261
                }
262
            }
263
            // Final iteration, run naive loop
UNCOV
264
            let remainder = chunks.remainder();
×
UNCOV
265
            inner_loop_naive(
×
UNCOV
266
                remainder,
×
UNCOV
267
                count_distinct_values,
×
UNCOV
268
                &mask.slice(offset, remainder.len()),
×
UNCOV
269
                &mut loop_state,
×
270
            );
271
        }
272
    }
273

274
    let (top_value, top_count) = if count_distinct_values {
54✔
275
        let (&top_value, &top_count) = loop_state
50✔
276
            .distinct_values
50✔
277
            .iter()
50✔
278
            .max_by_key(|&(_, &count)| count)
50✔
279
            .vortex_expect("non-empty");
50✔
280
        (top_value.0, top_count)
50✔
281
    } else {
282
        (T::default(), 0)
4✔
283
    };
284

285
    let runs = loop_state.runs;
54✔
286
    let distinct_values_count = if count_distinct_values {
54✔
287
        loop_state.distinct_values.len().try_into().vortex_unwrap()
50✔
288
    } else {
289
        u32::MAX
4✔
290
    };
291

292
    let min = array
54✔
293
        .statistics()
54✔
294
        .compute_as::<T>(Stat::Min)
54✔
295
        .vortex_expect("min should be computed");
54✔
296

297
    let max = array
54✔
298
        .statistics()
54✔
299
        .compute_as::<T>(Stat::Max)
54✔
300
        .vortex_expect("max should be computed");
54✔
301

302
    let typed = TypedStats {
54✔
303
        min,
54✔
304
        max,
54✔
305
        distinct_values: loop_state.distinct_values,
54✔
306
        top_value,
54✔
307
        top_count,
54✔
308
    };
54✔
309

310
    let null_count = null_count
54✔
311
        .try_into()
54✔
312
        .vortex_expect("null_count must fit in u32");
54✔
313
    let value_count = value_count
54✔
314
        .try_into()
54✔
315
        .vortex_expect("value_count must fit in u32");
54✔
316

317
    IntegerStats {
54✔
318
        src: array.clone(),
54✔
319
        null_count,
54✔
320
        value_count,
54✔
321
        average_run_length: value_count / runs,
54✔
322
        distinct_values_count,
54✔
323
        typed: typed.into(),
54✔
324
    }
54✔
325
}
54✔
326

327
struct LoopState<T> {
328
    prev: T,
329
    runs: u32,
330
    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
331
}
332

333
#[inline(always)]
334
fn inner_loop_nonnull<T: NativePType>(
528✔
335
    values: &[T; 64],
528✔
336
    count_distinct_values: bool,
528✔
337
    state: &mut LoopState<T>,
528✔
338
) where
528✔
339
    NativeValue<T>: Eq + Hash,
528✔
340
{
341
    for &value in values {
34,320✔
342
        if count_distinct_values {
33,792✔
343
            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
31,232✔
344
        }
31,232✔
345

346
        if value != state.prev {
33,792✔
347
            state.prev = value;
33,124✔
348
            state.runs += 1;
33,124✔
349
        }
33,124✔
350
    }
351
}
528✔
352

353
#[inline(always)]
UNCOV
354
fn inner_loop_nullable<T: NativePType>(
×
UNCOV
355
    values: &[T; 64],
×
UNCOV
356
    count_distinct_values: bool,
×
UNCOV
357
    is_valid: &BooleanBuffer,
×
UNCOV
358
    state: &mut LoopState<T>,
×
UNCOV
359
) where
×
UNCOV
360
    NativeValue<T>: Eq + Hash,
×
361
{
UNCOV
362
    for (idx, &value) in values.iter().enumerate() {
×
UNCOV
363
        if is_valid.value(idx) {
×
UNCOV
364
            if count_distinct_values {
×
UNCOV
365
                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
×
UNCOV
366
            }
×
367

UNCOV
368
            if value != state.prev {
×
UNCOV
369
                state.prev = value;
×
UNCOV
370
                state.runs += 1;
×
UNCOV
371
            }
×
UNCOV
372
        }
×
373
    }
UNCOV
374
}
×
375

376
#[inline(always)]
377
fn inner_loop_naive<T: NativePType>(
54✔
378
    values: &[T],
54✔
379
    count_distinct_values: bool,
54✔
380
    is_valid: &BooleanBuffer,
54✔
381
    state: &mut LoopState<T>,
54✔
382
) where
54✔
383
    NativeValue<T>: Eq + Hash,
54✔
384
{
385
    for (idx, &value) in values.iter().enumerate() {
478✔
386
        if is_valid.value(idx) {
478✔
387
            if count_distinct_values {
478✔
388
                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
478✔
389
            }
478✔
390

391
            if value != state.prev {
478✔
392
                state.prev = value;
448✔
393
                state.runs += 1;
448✔
394
            }
448✔
UNCOV
395
        }
×
396
    }
397
}
54✔
398

399
#[cfg(test)]
mod tests {
    use std::iter;

    use arrow_buffer::BooleanBuffer;
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::validity::Validity;
    use vortex_buffer::{Buffer, buffer};

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::stats::typed_int_stats;

    // Fewer than 64 values: only the naive tail loop runs.
    #[test]
    fn test_naive_count_distinct_values() {
        let input = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
        let computed = typed_int_stats::<u8>(&input, true);
        assert_eq!(computed.distinct_values_count, 2);
    }

    // Naive tail loop with a null entry, which must not be counted.
    #[test]
    fn test_naive_count_distinct_values_nullable() {
        let validity = Validity::from(BooleanBuffer::from(vec![true, false]));
        let input = PrimitiveArray::new(buffer![217u8, 0], validity);
        let computed = typed_int_stats::<u8>(&input, true);
        assert_eq!(computed.distinct_values_count, 1);
    }

    // Two full 64-value chunks without nulls.
    #[test]
    fn test_count_distinct_values() {
        let values = (0..128u8).collect::<Buffer<u8>>();
        let input = PrimitiveArray::new(values, Validity::NonNullable);
        let computed = typed_int_stats::<u8>(&input, true);
        assert_eq!(computed.distinct_values_count, 128);
    }

    // Two full 64-value chunks in which every second entry is null.
    #[test]
    fn test_count_distinct_values_nullable() {
        let values = (0..128u8).collect::<Buffer<u8>>();
        let alternating = BooleanBuffer::from_iter(iter::repeat_n(vec![true, false], 64).flatten());
        let input = PrimitiveArray::new(values, Validity::from(alternating));
        let computed = typed_int_stats::<u8>(&input, true);
        assert_eq!(computed.distinct_values_count, 64);
    }

    // The first entry is null, so the walk has to start at the first valid value.
    #[test]
    fn test_integer_stats_leading_nulls() {
        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));

        let computed = IntegerStats::generate(&ints);

        assert_eq!(computed.value_count, 2);
        assert_eq!(computed.null_count, 1);
        assert_eq!(computed.average_run_length, 1);
        assert_eq!(computed.distinct_values_count, 2);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc