16935267080

Committed 13 Aug 2025 11:00AM UTC coverage: 24.312% (-63.3%) from 87.658%

Build # 16935267080

Build Type

Pull #4226

github

Committed by

web-flow

Commit Message

Merge 81b48c7fb into baa6ea202

Pull Request #4226: Support converting TimestampTZ to and from duckdb

Run Details

0 of 2 new or added lines in 1 file covered. (0.0%)

20666 existing lines in 469 files now uncovered.

8726 of 35892 relevant lines covered (24.31%)

147.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

61.67

/vortex-btrblocks/src/integer/stats.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::hash::Hash;

use arrow_buffer::BooleanBuffer;
use num_traits::PrimInt;
use rustc_hash::FxBuildHasher;
use vortex_array::ToCanonical;
use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
use vortex_array::stats::Stat;
use vortex_dtype::{NativePType, match_each_integer_ptype};
use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
use vortex_mask::AllOr;
use vortex_scalar::{PValue, Scalar};
use vortex_utils::aliases::hash_map::HashMap;

use crate::sample::sample;
use crate::{CompressorStats, GenerateStatsOptions};

/// Value-level statistics for an integer array whose native element type is `T`.
///
/// Produced by `typed_int_stats` and stored type-erased inside [`ErasedStats`].
#[derive(Clone, Debug)]
pub struct TypedStats<T> {
    /// Minimum statistic of the source array. For empty or all-null arrays this holds the
    /// sentinel `T::max_value()`.
    pub min: T,
    /// Maximum statistic of the source array. For empty or all-null arrays this holds the
    /// sentinel `T::min_value()`.
    pub max: T,
    /// Value with the highest occurrence count in `distinct_values`, or `T::default()` when
    /// distinct values were not counted or the array holds no valid values.
    pub top_value: T,
    /// Occurrence count of `top_value`; `0` when distinct values were not counted or the
    /// array holds no valid values.
    pub top_count: u32,
    /// Occurrence count of every distinct valid value. Left empty when distinct values were
    /// not counted or the array holds no valid values.
    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
}

/// Type-erased container for one of the [TypedStats] variants.
///
/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
/// set of stats. We then perform a variety of access methods on them.
///
/// There is one variant per supported integer width and signedness; each variant wraps the
/// `TypedStats` instantiated with the matching native integer type.
#[derive(Clone, Debug)]
pub enum ErasedStats {
    U8(TypedStats<u8>),
    U16(TypedStats<u16>),
    U32(TypedStats<u32>),
    U64(TypedStats<u64>),
    I8(TypedStats<i8>),
    I16(TypedStats<i16>),
    I32(TypedStats<i32>),
    I64(TypedStats<i64>),
}

impl ErasedStats {
    pub fn min_is_zero(&self) -> bool {
        match &self {
            ErasedStats::U8(x) => x.min == 0,
            ErasedStats::U16(x) => x.min == 0,
            ErasedStats::U32(x) => x.min == 0,
            ErasedStats::U64(x) => x.min == 0,
            ErasedStats::I8(x) => x.min == 0,
            ErasedStats::I16(x) => x.min == 0,
            ErasedStats::I32(x) => x.min == 0,
            ErasedStats::I64(x) => x.min == 0,
        }
    }

    pub fn min_is_negative(&self) -> bool {
        match &self {
            ErasedStats::U8(_)
            | ErasedStats::U16(_)
            | ErasedStats::U32(_)
            | ErasedStats::U64(_) => false,
            ErasedStats::I8(x) => x.min < 0,
            ErasedStats::I16(x) => x.min < 0,
            ErasedStats::I32(x) => x.min < 0,
            ErasedStats::I64(x) => x.min < 0,
        }
    }

    // Difference between max and min.
    pub fn max_minus_min(&self) -> u64 {
        match &self {
            ErasedStats::U8(x) => (x.max - x.min) as u64,
            ErasedStats::U16(x) => (x.max - x.min) as u64,
            ErasedStats::U32(x) => (x.max - x.min) as u64,
            ErasedStats::U64(x) => x.max - x.min,
            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
                .vortex_expect("max minus min result bigger than u64"),
        }
    }

    /// Get the most commonly occurring value and its count
    pub fn top_value_and_count(&self) -> (PValue, u32) {
        match &self {
            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
        }
    }
}

macro_rules! impl_from_typed {
    ($T:ty, $variant:path) => {
        impl From<TypedStats<$T>> for ErasedStats {
            fn from(typed: TypedStats<$T>) -> Self {
                $variant(typed)
            }
        }
    };
}

impl_from_typed!(u8, ErasedStats::U8);
impl_from_typed!(u16, ErasedStats::U16);
impl_from_typed!(u32, ErasedStats::U32);
impl_from_typed!(u64, ErasedStats::U64);
impl_from_typed!(i8, ErasedStats::I8);
impl_from_typed!(i16, ErasedStats::I16);
impl_from_typed!(i32, ErasedStats::I32);
impl_from_typed!(i64, ErasedStats::I64);

/// Statistics computed over an integer [`PrimitiveArray`], combining array-level counts with
/// the type-erased value statistics in `typed`.
#[derive(Clone, Debug)]
pub struct IntegerStats {
    /// The array these statistics were computed from.
    pub(super) src: PrimitiveArray,
    // cache for validity.false_count()
    /// Number of null entries in `src`.
    pub(super) null_count: u32,
    // cache for validity.true_count()
    /// Number of valid (non-null) entries in `src`.
    pub(super) value_count: u32,
    /// `value_count` divided (integer division) by the number of runs, where a new run starts
    /// whenever a valid value differs from the previous valid value. `0` when `src` holds no
    /// valid values.
    pub(super) average_run_length: u32,
    /// Number of distinct valid values. `u32::MAX` when distinct values were not counted for a
    /// non-empty array; `0` when `src` holds no valid values.
    pub(super) distinct_values_count: u32,
    /// Value-level statistics (min, max, top value, per-value counts), erased over the
    /// integer type.
    pub(crate) typed: ErasedStats,
}

impl CompressorStats for IntegerStats {
    type ArrayVTable = PrimitiveVTable;

    /// Computes statistics for `input`, dispatching on its integer ptype to the matching
    /// monomorphised `typed_int_stats`.
    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
        let count_distinct_values = opts.count_distinct_values;
        match_each_integer_ptype!(input.ptype(), |T| {
            typed_int_stats::<T>(input, count_distinct_values)
        })
    }

    /// Returns the array these statistics were computed from.
    fn source(&self) -> &PrimitiveArray {
        &self.src
    }

    /// Draws a sample from the source array and computes fresh statistics over that sample.
    ///
    /// # Panics
    ///
    /// Panics if the sampled array cannot be converted to a primitive array.
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
        let sampled_array = sample(self.src.as_ref(), sample_size, sample_count);
        let sampled_primitive = sampled_array.to_primitive().vortex_expect("primitive");

        Self::generate_opts(&sampled_primitive, opts)
    }
}

fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats
where
    T: NativePType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
    TypedStats<T>: Into<ErasedStats>,
    NativeValue<T>: Eq + Hash,
{
    // Special case: empty array
    if array.is_empty() {
        return IntegerStats {
            src: array.clone(),
            null_count: 0,
            value_count: 0,
            average_run_length: 0,
            distinct_values_count: 0,
            typed: TypedStats {
                min: T::max_value(),
                max: T::min_value(),
                top_value: T::default(),
                top_count: 0,
                distinct_values: HashMap::with_hasher(FxBuildHasher),
            }
            .into(),
        };
    } else if array.all_invalid().vortex_expect("all_invalid") {
        return IntegerStats {
            src: array.clone(),
            null_count: array.len().try_into().vortex_expect("null_count"),
            value_count: 0,
            average_run_length: 0,
            distinct_values_count: 0,
            typed: TypedStats {
                min: T::max_value(),
                max: T::min_value(),
                top_value: T::default(),
                top_count: 0,
                distinct_values: HashMap::with_hasher(FxBuildHasher),
            }
            .into(),
        };
    }

    let validity = array.validity_mask().vortex_expect("logical_validity");
    let null_count = validity.false_count();
    let value_count = validity.true_count();

    // Initialize loop state
    let head_idx = validity
        .first()
        .vortex_expect("All null masks have been handled before");
    let buffer = array.buffer::<T>();
    let head = buffer[head_idx];

    let mut loop_state = LoopState {
        distinct_values: if count_distinct_values {
            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
        } else {
            HashMap::with_hasher(FxBuildHasher)
        },
        prev: head,
        runs: 1,
    };

    let sliced = buffer.slice(head_idx..array.len());
    let mut chunks = sliced.as_slice().chunks_exact(64);
    match validity.boolean_buffer() {
        AllOr::All => {
            for chunk in &mut chunks {
                inner_loop_nonnull(
                    chunk.try_into().vortex_unwrap(),
                    count_distinct_values,
                    &mut loop_state,
                )
            }
            let remainder = chunks.remainder();
            inner_loop_naive(
                remainder,
                count_distinct_values,
                &BooleanBuffer::new_set(remainder.len()),
                &mut loop_state,
            );
        }
        AllOr::None => unreachable!("All invalid arrays have been handled before"),
        AllOr::Some(v) => {
            let mask = v.slice(head_idx, array.len() - head_idx);
            let mut offset = 0;
            for chunk in &mut chunks {
                let validity = mask.slice(offset, 64);
                offset += 64;

                match validity.count_set_bits() {
                    // All nulls -> no stats to update
                    0 => continue,
                    // Inner loop for when validity check can be elided
                    64 => inner_loop_nonnull(
                        chunk.try_into().vortex_unwrap(),
                        count_distinct_values,
                        &mut loop_state,
                    ),
                    // Inner loop for when we need to check validity
                    _ => inner_loop_nullable(
                        chunk.try_into().vortex_unwrap(),
                        count_distinct_values,
                        &validity,
                        &mut loop_state,
                    ),
                }
            }
            // Final iteration, run naive loop
            let remainder = chunks.remainder();
            inner_loop_naive(
                remainder,
                count_distinct_values,
                &mask.slice(offset, remainder.len()),
                &mut loop_state,
            );
        }
    }

    let (top_value, top_count) = if count_distinct_values {
        let (&top_value, &top_count) = loop_state
            .distinct_values
            .iter()
            .max_by_key(|&(_, &count)| count)
            .vortex_expect("non-empty");
        (top_value.0, top_count)
    } else {
        (T::default(), 0)
    };

    let runs = loop_state.runs;
    let distinct_values_count = if count_distinct_values {
        loop_state.distinct_values.len().try_into().vortex_unwrap()
    } else {
        u32::MAX
    };

    let min = array
        .statistics()
        .compute_as::<T>(Stat::Min)
        .vortex_expect("min should be computed");

    let max = array
        .statistics()
        .compute_as::<T>(Stat::Max)
        .vortex_expect("max should be computed");

    let typed = TypedStats {
        min,
        max,
        distinct_values: loop_state.distinct_values,
        top_value,
        top_count,
    };

    let null_count = null_count
        .try_into()
        .vortex_expect("null_count must fit in u32");
    let value_count = value_count
        .try_into()
        .vortex_expect("value_count must fit in u32");

    IntegerStats {
        src: array.clone(),
        null_count,
        value_count,
        average_run_length: value_count / runs,
        distinct_values_count,
        typed: typed.into(),
    }
}

/// Mutable state threaded through the `inner_loop_*` helpers while scanning an array.
struct LoopState<T> {
    /// The most recently seen valid value whose run is currently open.
    prev: T,
    /// Number of runs seen so far; incremented whenever a valid value differs from `prev`.
    runs: u32,
    /// Occurrence count per distinct valid value; only updated when distinct counting is on.
    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
}

/// Updates `state` with a full chunk of 64 values that are all known to be valid.
///
/// Each value is optionally tallied in `state.distinct_values`, and a new run is recorded
/// whenever the value differs from the previously seen one.
#[inline(always)]
fn inner_loop_nonnull<T: NativePType>(
    values: &[T; 64],
    count_distinct_values: bool,
    state: &mut LoopState<T>,
) where
    NativeValue<T>: Eq + Hash,
{
    for value in values.iter().copied() {
        if count_distinct_values {
            let occurrences = state.distinct_values.entry(NativeValue(value)).or_insert(0);
            *occurrences += 1;
        }

        let starts_new_run = value != state.prev;
        if starts_new_run {
            state.prev = value;
            state.runs += 1;
        }
    }
}

/// Updates `state` with a full chunk of 64 values, skipping positions that `is_valid` marks
/// as null.
///
/// `is_valid` is indexed with the position inside the chunk. Valid values are optionally
/// tallied in `state.distinct_values`, and a new run is recorded whenever a valid value
/// differs from the previously seen valid value.
#[inline(always)]
fn inner_loop_nullable<T: NativePType>(
    values: &[T; 64],
    count_distinct_values: bool,
    is_valid: &BooleanBuffer,
    state: &mut LoopState<T>,
) where
    NativeValue<T>: Eq + Hash,
{
    for (position, value) in values.iter().copied().enumerate() {
        if !is_valid.value(position) {
            continue;
        }

        if count_distinct_values {
            let occurrences = state.distinct_values.entry(NativeValue(value)).or_insert(0);
            *occurrences += 1;
        }

        let starts_new_run = value != state.prev;
        if starts_new_run {
            state.prev = value;
            state.runs += 1;
        }
    }
}

/// Updates `state` with a slice of arbitrary length, skipping positions that `is_valid`
/// marks as null.
///
/// `is_valid` is indexed with the position inside `values`. Valid values are optionally
/// tallied in `state.distinct_values`, and a new run is recorded whenever a valid value
/// differs from the previously seen valid value.
#[inline(always)]
fn inner_loop_naive<T: NativePType>(
    values: &[T],
    count_distinct_values: bool,
    is_valid: &BooleanBuffer,
    state: &mut LoopState<T>,
) where
    NativeValue<T>: Eq + Hash,
{
    for (position, value) in values.iter().copied().enumerate() {
        if !is_valid.value(position) {
            continue;
        }

        if count_distinct_values {
            let occurrences = state.distinct_values.entry(NativeValue(value)).or_insert(0);
            *occurrences += 1;
        }

        let starts_new_run = value != state.prev;
        if starts_new_run {
            state.prev = value;
            state.runs += 1;
        }
    }
}

#[cfg(test)]
mod tests {
    use std::iter;

    use arrow_buffer::BooleanBuffer;
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::validity::Validity;
    use vortex_buffer::{Buffer, buffer};

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::stats::typed_int_stats;

    /// Distinct-value count reported for a `u8` array with distinct counting enabled.
    fn distinct_u8_count(array: &PrimitiveArray) -> u32 {
        typed_int_stats::<u8>(array, true).distinct_values_count
    }

    // Short non-nullable input: only the remainder (naive) loop runs.
    #[test]
    fn test_naive_count_distinct_values() {
        let values = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
        assert_eq!(distinct_u8_count(&values), 2);
    }

    // Short nullable input: the null second element must not be counted.
    #[test]
    fn test_naive_count_distinct_values_nullable() {
        let validity = Validity::from(BooleanBuffer::from(vec![true, false]));
        let values = PrimitiveArray::new(buffer![217u8, 0], validity);
        assert_eq!(distinct_u8_count(&values), 1);
    }

    // Two full non-nullable chunks of 64 values, all distinct.
    #[test]
    fn test_count_distinct_values() {
        let values =
            PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
        assert_eq!(distinct_u8_count(&values), 128);
    }

    // Two full chunks where every second value is null.
    #[test]
    fn test_count_distinct_values_nullable() {
        let alternating = iter::repeat_n(vec![true, false], 64).flatten();
        let values = PrimitiveArray::new(
            (0..128u8).collect::<Buffer<u8>>(),
            Validity::from(BooleanBuffer::from_iter(alternating)),
        );
        assert_eq!(distinct_u8_count(&values), 64);
    }

    // A leading null must be skipped when seeding the run and distinct-value tracking.
    #[test]
    fn test_integer_stats_leading_nulls() {
        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));

        let computed = IntegerStats::generate(&ints);

        assert_eq!(computed.value_count, 2);
        assert_eq!(computed.null_count, 1);
        assert_eq!(computed.average_run_length, 1);
        assert_eq!(computed.distinct_values_count, 2);
    }
}

1	// SPDX-License-Identifier: Apache-2.0
2	// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4	use std::hash::Hash;
5
6	use arrow_buffer::BooleanBuffer;
7	use num_traits::PrimInt;
8	use rustc_hash::FxBuildHasher;
9	use vortex_array::ToCanonical;
10	use vortex_array::arrays::{NativeValue, PrimitiveArray, PrimitiveVTable};
11	use vortex_array::stats::Stat;
12	use vortex_dtype::{NativePType, match_each_integer_ptype};
13	use vortex_error::{VortexError, VortexExpect, VortexUnwrap};
14	use vortex_mask::AllOr;
15	use vortex_scalar::{PValue, Scalar};
16	use vortex_utils::aliases::hash_map::HashMap;
17
18	use crate::sample::sample;
19	use crate::{CompressorStats, GenerateStatsOptions};
20
21	#[derive(Clone, Debug)]
22	pub struct TypedStats<T> {
23	pub min: T,
24	pub max: T,
25	pub top_value: T,
26	pub top_count: u32,
27	pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
28	}
29
30	/// Type-erased container for one of the [TypedStats] variants.
31	///
32	/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
33	/// set of stats. We then perform a variety of access methods on them.
34	#[derive(Clone, Debug)]
35	pub enum ErasedStats {
36	U8(TypedStats<u8>),
37	U16(TypedStats<u16>),
38	U32(TypedStats<u32>),
39	U64(TypedStats<u64>),
40	I8(TypedStats<i8>),
41	I16(TypedStats<i16>),
42	I32(TypedStats<i32>),
43	I64(TypedStats<i64>),
44	}
45
46	impl ErasedStats {
47	pub fn min_is_zero(&self) -> bool {	30✔
48	match &self {	30✔
UNCOV 49	ErasedStats::U8(x) => x.min == 0,	×
UNCOV 50	ErasedStats::U16(x) => x.min == 0,	×
UNCOV 51	ErasedStats::U32(x) => x.min == 0,	×
52	ErasedStats::U64(x) => x.min == 0,	6✔
UNCOV 53	ErasedStats::I8(x) => x.min == 0,	×
UNCOV 54	ErasedStats::I16(x) => x.min == 0,	×
55	ErasedStats::I32(x) => x.min == 0,	20✔
56	ErasedStats::I64(x) => x.min == 0,	4✔
57	}
58	}	30✔
59
60	pub fn min_is_negative(&self) -> bool {	60✔
61	match &self {	60✔
62	ErasedStats::U8(_)
63	\| ErasedStats::U16(_)
64	\| ErasedStats::U32(_)
65	\| ErasedStats::U64(_) => false,	12✔
UNCOV 66	ErasedStats::I8(x) => x.min < 0,	×
UNCOV 67	ErasedStats::I16(x) => x.min < 0,	×
68	ErasedStats::I32(x) => x.min < 0,	40✔
69	ErasedStats::I64(x) => x.min < 0,	8✔
70	}
71	}	60✔
72
73	// Difference between max and min.
74	pub fn max_minus_min(&self) -> u64 {	12✔
75	match &self {	12✔
UNCOV 76	ErasedStats::U8(x) => (x.max - x.min) as u64,	×
UNCOV 77	ErasedStats::U16(x) => (x.max - x.min) as u64,	×
UNCOV 78	ErasedStats::U32(x) => (x.max - x.min) as u64,	×
UNCOV 79	ErasedStats::U64(x) => x.max - x.min,	×
UNCOV 80	ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,	×
UNCOV 81	ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,	×
82	ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,	8✔
83	ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)	4✔
84	.vortex_expect("max minus min result bigger than u64"),	4✔
85	}
86	}	12✔
87
88	/// Get the most commonly occurring value and its count
89	pub fn top_value_and_count(&self) -> (PValue, u32) {	30✔
90	match &self {	30✔
UNCOV 91	ErasedStats::U8(x) => (x.top_value.into(), x.top_count),	×
UNCOV 92	ErasedStats::U16(x) => (x.top_value.into(), x.top_count),	×
UNCOV 93	ErasedStats::U32(x) => (x.top_value.into(), x.top_count),	×
94	ErasedStats::U64(x) => (x.top_value.into(), x.top_count),	6✔
UNCOV 95	ErasedStats::I8(x) => (x.top_value.into(), x.top_count),	×
UNCOV 96	ErasedStats::I16(x) => (x.top_value.into(), x.top_count),	×
97	ErasedStats::I32(x) => (x.top_value.into(), x.top_count),	20✔
98	ErasedStats::I64(x) => (x.top_value.into(), x.top_count),	4✔
99	}
100	}	30✔
101	}
102
103	macro_rules! impl_from_typed {
104	($T:ty, $variant:path) => {
105	impl From<TypedStats<$T>> for ErasedStats {
106	fn from(typed: TypedStats<$T>) -> Self {	54✔
107	$variant(typed)	54✔
108	}	54✔
109	}
110	};
111	}
112
113	impl_from_typed!(u8, ErasedStats::U8);
114	impl_from_typed!(u16, ErasedStats::U16);
115	impl_from_typed!(u32, ErasedStats::U32);
116	impl_from_typed!(u64, ErasedStats::U64);
117	impl_from_typed!(i8, ErasedStats::I8);
118	impl_from_typed!(i16, ErasedStats::I16);
119	impl_from_typed!(i32, ErasedStats::I32);
120	impl_from_typed!(i64, ErasedStats::I64);
121
122	#[derive(Clone, Debug)]
123	pub struct IntegerStats {
124	pub(super) src: PrimitiveArray,
125	// cache for validity.false_count()
126	pub(super) null_count: u32,
127	// cache for validity.true_count()
128	pub(super) value_count: u32,
129	pub(super) average_run_length: u32,
130	pub(super) distinct_values_count: u32,
131	pub(crate) typed: ErasedStats,
132	}
133
134	impl CompressorStats for IntegerStats {
135	type ArrayVTable = PrimitiveVTable;
136
137	fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {	54✔
138	match_each_integer_ptype!(input.ptype(), \|T\| {	54✔
UNCOV 139	typed_int_stats::<T>(input, opts.count_distinct_values)	×
140	})
141	}	54✔
142
143	fn source(&self) -> &PrimitiveArray {	220✔
144	&self.src	220✔
145	}	220✔
146
147	fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {	20✔
148	let sampled = sample(self.src.as_ref(), sample_size, sample_count)	20✔
149	.to_primitive()	20✔
150	.vortex_expect("primitive");	20✔
151
152	Self::generate_opts(&sampled, opts)	20✔
153	}	20✔
154	}
155
156	fn typed_int_stats<T>(array: &PrimitiveArray, count_distinct_values: bool) -> IntegerStats	54✔
157	where	54✔
158	T: NativePType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,	54✔
159	TypedStats<T>: Into<ErasedStats>,	54✔
160	NativeValue<T>: Eq + Hash,	54✔
161	{
162	// Special case: empty array
163	if array.is_empty() {	54✔
164	return IntegerStats {	×
165	src: array.clone(),	×
166	null_count: 0,	×
167	value_count: 0,	×
168	average_run_length: 0,	×
169	distinct_values_count: 0,	×
170	typed: TypedStats {	×
171	min: T::max_value(),	×
172	max: T::min_value(),	×
173	top_value: T::default(),	×
174	top_count: 0,	×
175	distinct_values: HashMap::with_hasher(FxBuildHasher),	×
176	}	×
177	.into(),	×
178	};	×
179	} else if array.all_invalid().vortex_expect("all_invalid") {	54✔
180	return IntegerStats {	×
181	src: array.clone(),	×
182	null_count: array.len().try_into().vortex_expect("null_count"),	×
183	value_count: 0,	×
184	average_run_length: 0,	×
185	distinct_values_count: 0,	×
186	typed: TypedStats {	×
187	min: T::max_value(),	×
188	max: T::min_value(),	×
189	top_value: T::default(),	×
190	top_count: 0,	×
191	distinct_values: HashMap::with_hasher(FxBuildHasher),	×
192	}	×
193	.into(),	×
194	};	×
195	}	54✔
196
197	let validity = array.validity_mask().vortex_expect("logical_validity");	54✔
198	let null_count = validity.false_count();	54✔
199	let value_count = validity.true_count();	54✔
200
201	// Initialize loop state
202	let head_idx = validity	54✔
203	.first()	54✔
204	.vortex_expect("All null masks have been handled before");	54✔
205	let buffer = array.buffer::<T>();	54✔
206	let head = buffer[head_idx];	54✔
207
208	let mut loop_state = LoopState {	54✔
209	distinct_values: if count_distinct_values {	54✔
210	HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)	50✔
211	} else {
212	HashMap::with_hasher(FxBuildHasher)	4✔
213	},
214	prev: head,	54✔
215	runs: 1,
216	};
217
218	let sliced = buffer.slice(head_idx..array.len());	54✔
219	let mut chunks = sliced.as_slice().chunks_exact(64);	54✔
220	match validity.boolean_buffer() {	54✔
221	AllOr::All => {
222	for chunk in &mut chunks {	582✔
223	inner_loop_nonnull(	528✔
224	chunk.try_into().vortex_unwrap(),	528✔
225	count_distinct_values,	528✔
226	&mut loop_state,	528✔
227	)
228	}
229	let remainder = chunks.remainder();	54✔
230	inner_loop_naive(	54✔
231	remainder,	54✔
232	count_distinct_values,	54✔
233	&BooleanBuffer::new_set(remainder.len()),	54✔
234	&mut loop_state,	54✔
235	);
236	}
237	AllOr::None => unreachable!("All invalid arrays have been handled before"),	×
UNCOV 238	AllOr::Some(v) => {	×
UNCOV 239	let mask = v.slice(head_idx, array.len() - head_idx);	×
UNCOV 240	let mut offset = 0;	×
UNCOV 241	for chunk in &mut chunks {	×
UNCOV 242	let validity = mask.slice(offset, 64);	×
UNCOV 243	offset += 64;	×
244
UNCOV 245	match validity.count_set_bits() {	×
246	// All nulls -> no stats to update
247	0 => continue,	×
248	// Inner loop for when validity check can be elided
249	64 => inner_loop_nonnull(	×
250	chunk.try_into().vortex_unwrap(),	×
251	count_distinct_values,	×
252	&mut loop_state,	×
253	),
254	// Inner loop for when we need to check validity
UNCOV 255	_ => inner_loop_nullable(	×
UNCOV 256	chunk.try_into().vortex_unwrap(),	×
UNCOV 257	count_distinct_values,	×
UNCOV 258	&validity,	×
UNCOV 259	&mut loop_state,	×
260	),
261	}
262	}
263	// Final iteration, run naive loop
UNCOV 264	let remainder = chunks.remainder();	×
UNCOV 265	inner_loop_naive(	×
UNCOV 266	remainder,	×
UNCOV 267	count_distinct_values,	×
UNCOV 268	&mask.slice(offset, remainder.len()),	×
UNCOV 269	&mut loop_state,	×
270	);
271	}
272	}
273
274	let (top_value, top_count) = if count_distinct_values {	54✔
275	let (&top_value, &top_count) = loop_state	50✔
276	.distinct_values	50✔
277	.iter()	50✔
278	.max_by_key(\|&(_, &count)\| count)	50✔
279	.vortex_expect("non-empty");	50✔
280	(top_value.0, top_count)	50✔
281	} else {
282	(T::default(), 0)	4✔
283	};
284
285	let runs = loop_state.runs;	54✔
286	let distinct_values_count = if count_distinct_values {	54✔
287	loop_state.distinct_values.len().try_into().vortex_unwrap()	50✔
288	} else {
289	u32::MAX	4✔
290	};
291
292	let min = array	54✔
293	.statistics()	54✔
294	.compute_as::<T>(Stat::Min)	54✔
295	.vortex_expect("min should be computed");	54✔
296
297	let max = array	54✔
298	.statistics()	54✔
299	.compute_as::<T>(Stat::Max)	54✔
300	.vortex_expect("max should be computed");	54✔
301
302	let typed = TypedStats {	54✔
303	min,	54✔
304	max,	54✔
305	distinct_values: loop_state.distinct_values,	54✔
306	top_value,	54✔
307	top_count,	54✔
308	};	54✔
309
310	let null_count = null_count	54✔
311	.try_into()	54✔
312	.vortex_expect("null_count must fit in u32");	54✔
313	let value_count = value_count	54✔
314	.try_into()	54✔
315	.vortex_expect("value_count must fit in u32");	54✔
316
317	IntegerStats {	54✔
318	src: array.clone(),	54✔
319	null_count,	54✔
320	value_count,	54✔
321	average_run_length: value_count / runs,	54✔
322	distinct_values_count,	54✔
323	typed: typed.into(),	54✔
324	}	54✔
325	}	54✔
326
327	struct LoopState<T> {
328	prev: T,
329	runs: u32,
330	distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
331	}
332
333	#[inline(always)]
334	fn inner_loop_nonnull<T: NativePType>(	528✔
335	values: &[T; 64],	528✔
336	count_distinct_values: bool,	528✔
337	state: &mut LoopState<T>,	528✔
338	) where	528✔
339	NativeValue<T>: Eq + Hash,	528✔
340	{
341	for &value in values {	34,320✔
342	if count_distinct_values {	33,792✔
343	*state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;	31,232✔
344	}	31,232✔
345
346	if value != state.prev {	33,792✔
347	state.prev = value;	33,124✔
348	state.runs += 1;	33,124✔
349	}	33,124✔
350	}
351	}	528✔
352
353	#[inline(always)]
UNCOV 354	fn inner_loop_nullable<T: NativePType>(	×
UNCOV 355	values: &[T; 64],	×
UNCOV 356	count_distinct_values: bool,	×
UNCOV 357	is_valid: &BooleanBuffer,	×
UNCOV 358	state: &mut LoopState<T>,	×
UNCOV 359	) where	×
UNCOV 360	NativeValue<T>: Eq + Hash,	×
361	{
UNCOV 362	for (idx, &value) in values.iter().enumerate() {	×
UNCOV 363	if is_valid.value(idx) {	×
UNCOV 364	if count_distinct_values {	×
UNCOV 365	*state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;	×
UNCOV 366	}	×
367
UNCOV 368	if value != state.prev {	×
UNCOV 369	state.prev = value;	×
UNCOV 370	state.runs += 1;	×
UNCOV 371	}	×
UNCOV 372	}	×
373	}
UNCOV 374	}	×
375
376	#[inline(always)]
377	fn inner_loop_naive<T: NativePType>(	54✔
378	values: &[T],	54✔
379	count_distinct_values: bool,	54✔
380	is_valid: &BooleanBuffer,	54✔
381	state: &mut LoopState<T>,	54✔
382	) where	54✔
383	NativeValue<T>: Eq + Hash,	54✔
384	{
385	for (idx, &value) in values.iter().enumerate() {	478✔
386	if is_valid.value(idx) {	478✔
387	if count_distinct_values {	478✔
388	*state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;	478✔
389	}	478✔
390
391	if value != state.prev {	478✔
392	state.prev = value;	448✔
393	state.runs += 1;	448✔
394	}	448✔
UNCOV 395	}	×
396	}
397	}	54✔
398
399	#[cfg(test)]
400	mod tests {
401	use std::iter;
402
403	use arrow_buffer::BooleanBuffer;
404	use vortex_array::arrays::PrimitiveArray;
405	use vortex_array::validity::Validity;
406	use vortex_buffer::{Buffer, buffer};
407
408	use crate::CompressorStats;
409	use crate::integer::IntegerStats;
410	use crate::integer::stats::typed_int_stats;
411
412	#[test]
413	fn test_naive_count_distinct_values() {
414	let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
415	let stats = typed_int_stats::<u8>(&array, true);
416	assert_eq!(stats.distinct_values_count, 2);
417	}
418
419	#[test]
420	fn test_naive_count_distinct_values_nullable() {
421	let array = PrimitiveArray::new(
422	buffer![217u8, 0],
423	Validity::from(BooleanBuffer::from(vec![true, false])),
424	);
425	let stats = typed_int_stats::<u8>(&array, true);
426	assert_eq!(stats.distinct_values_count, 1);
427	}
428
429	#[test]
430	fn test_count_distinct_values() {
431	let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
432	let stats = typed_int_stats::<u8>(&array, true);
433	assert_eq!(stats.distinct_values_count, 128);
434	}
435
436	#[test]
437	fn test_count_distinct_values_nullable() {
438	let array = PrimitiveArray::new(
439	(0..128u8).collect::<Buffer<u8>>(),
440	Validity::from(BooleanBuffer::from_iter(
441	iter::repeat_n(vec![true, false], 64).flatten(),
442	)),
443	);
444	let stats = typed_int_stats::<u8>(&array, true);
445	assert_eq!(stats.distinct_values_count, 64);
446	}
447
448	#[test]
449	fn test_integer_stats_leading_nulls() {
450	let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
451
452	let stats = IntegerStats::generate(&ints);
453
454	assert_eq!(stats.value_count, 2);
455	assert_eq!(stats.null_count, 1);
456	assert_eq!(stats.average_run_length, 1);
457	assert_eq!(stats.distinct_values_count, 2);
458	}
459	}

vortex-data / vortex / 16935267080

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous