• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16935267080

13 Aug 2025 11:00AM UTC coverage: 24.312% (-63.3%) from 87.658%
16935267080

Pull #4226

github

web-flow
Merge 81b48c7fb into baa6ea202
Pull Request #4226: Support converting TimestampTZ to and from duckdb

0 of 2 new or added lines in 1 file covered. (0.0%)

20666 existing lines in 469 files now uncovered.

8726 of 35892 relevant lines covered (24.31%)

147.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

68.92
/vortex-array/src/stats/mod.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Traits and utilities to compute and access array statistics.
5

6
use std::fmt::{Debug, Display, Formatter};
7
use std::hash::Hash;
8

9
use arrow_buffer::bit_iterator::BitIterator;
10
use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
11
use enum_iterator::{Sequence, all, last};
12
use log::debug;
13
use num_enum::{IntoPrimitive, TryFromPrimitive};
14
pub use stats_set::*;
15
use vortex_dtype::Nullability::{NonNullable, Nullable};
16
use vortex_dtype::{DType, PType};
17

18
mod array;
19
mod bound;
20
pub mod flatbuffers;
21
mod precision;
22
mod provider;
23
mod stat_bound;
24
mod stats_set;
25

26
pub use array::*;
27
pub use bound::{LowerBound, UpperBound};
28
pub use precision::Precision;
29
pub use provider::*;
30
pub use stat_bound::*;
31
use vortex_error::VortexExpect;
32

33
/// Statistics that are used for pruning files (i.e., we want to ensure they are computed when compressing/writing).
34
/// Sum is included for boolean arrays.
35
pub const PRUNING_STATS: &[Stat] = &[
36
    Stat::Min,
37
    Stat::Max,
38
    Stat::Sum,
39
    Stat::NullCount,
40
    Stat::NaNCount,
41
];
42

43
#[derive(
44
    Debug,
45
    Clone,
46
    Copy,
47
    PartialEq,
48
    Eq,
49
    PartialOrd,
50
    Ord,
51
    Hash,
52
    Sequence,
53
    IntoPrimitive,
54
    TryFromPrimitive,
55
)]
56
#[repr(u8)]
57
pub enum Stat {
58
    /// Whether all values are the same (nulls are not equal to other non-null values,
59
    /// so this is true iff all values are null or all values are the same non-null value)
60
    IsConstant = 0,
61
    /// Whether the non-null values in the array are sorted (i.e., we skip nulls)
62
    IsSorted = 1,
63
    /// Whether the non-null values in the array are strictly sorted (i.e., sorted with no duplicates)
64
    IsStrictSorted = 2,
65
    /// The maximum value in the array (ignoring nulls, unless all values are null)
66
    Max = 3,
67
    /// The minimum value in the array (ignoring nulls, unless all values are null)
68
    Min = 4,
69
    /// The sum of the non-null values of the array.
70
    Sum = 5,
71
    /// The number of null values in the array
72
    NullCount = 6,
73
    /// The uncompressed size of the array in bytes
74
    UncompressedSizeInBytes = 7,
75
    /// The number of NaN values in the array
76
    NaNCount = 8,
77
}
78

79
// Marker types, one per `Stat` variant.
//
// They allow the bound to be extracted from a `Precision` value: each marker ties a
// `Stat` to its `StatBound` through a `StatType` implementation.

/// Marker type for [`Stat::Max`].
pub struct Max;
/// Marker type for [`Stat::Min`].
pub struct Min;
/// Marker type for [`Stat::Sum`].
pub struct Sum;
/// Marker type for [`Stat::IsConstant`].
pub struct IsConstant;
/// Marker type for [`Stat::IsSorted`].
pub struct IsSorted;
/// Marker type for [`Stat::IsStrictSorted`].
pub struct IsStrictSorted;
/// Marker type for [`Stat::NullCount`].
pub struct NullCount;
/// Marker type for [`Stat::UncompressedSizeInBytes`].
pub struct UncompressedSizeInBytes;
/// Marker type for [`Stat::NaNCount`].
pub struct NaNCount;
90

91
impl StatType<bool> for IsConstant {
92
    type Bound = Precision<bool>;
93

94
    const STAT: Stat = Stat::IsConstant;
95
}
96

97
impl StatType<bool> for IsSorted {
98
    type Bound = Precision<bool>;
99

100
    const STAT: Stat = Stat::IsSorted;
101
}
102

103
impl StatType<bool> for IsStrictSorted {
104
    type Bound = Precision<bool>;
105

106
    const STAT: Stat = Stat::IsStrictSorted;
107
}
108

109
impl<T: PartialOrd + Clone> StatType<T> for NullCount {
110
    type Bound = UpperBound<T>;
111

112
    const STAT: Stat = Stat::NullCount;
113
}
114

115
impl<T: PartialOrd + Clone> StatType<T> for UncompressedSizeInBytes {
116
    type Bound = UpperBound<T>;
117

118
    const STAT: Stat = Stat::UncompressedSizeInBytes;
119
}
120

121
impl<T: PartialOrd + Clone + Debug> StatType<T> for Max {
122
    type Bound = UpperBound<T>;
123

124
    const STAT: Stat = Stat::Max;
125
}
126

127
impl<T: PartialOrd + Clone + Debug> StatType<T> for Min {
128
    type Bound = LowerBound<T>;
129

130
    const STAT: Stat = Stat::Min;
131
}
132

133
impl<T: PartialOrd + Clone + Debug> StatType<T> for Sum {
134
    type Bound = Precision<T>;
135

136
    const STAT: Stat = Stat::Sum;
137
}
138

139
impl<T: PartialOrd + Clone> StatType<T> for NaNCount {
140
    type Bound = UpperBound<T>;
141

142
    const STAT: Stat = Stat::NaNCount;
143
}
144

145
impl Stat {
146
    /// Whether the statistic is commutative (i.e., whether merging can be done independently of ordering)
147
    /// e.g., min/max are commutative, but is_sorted is not
UNCOV
148
    pub fn is_commutative(&self) -> bool {
×
149
        // NOTE: we prefer this syntax to force a compile error if we add a new stat
UNCOV
150
        match self {
×
151
            Self::IsConstant
152
            | Self::Max
153
            | Self::Min
154
            | Self::NullCount
155
            | Self::Sum
156
            | Self::NaNCount
UNCOV
157
            | Self::UncompressedSizeInBytes => true,
×
UNCOV
158
            Self::IsSorted | Self::IsStrictSorted => false,
×
159
        }
UNCOV
160
    }
×
161

162
    /// Whether the statistic has the same dtype as the array it's computed on
UNCOV
163
    pub fn has_same_dtype_as_array(&self) -> bool {
×
UNCOV
164
        matches!(self, Stat::Min | Stat::Max)
×
UNCOV
165
    }
×
166

167
    /// Return the [`DType`] of the statistic scalar assuming the array is of the given [`DType`].
168
    pub fn dtype(&self, data_type: &DType) -> Option<DType> {
408✔
169
        Some(match self {
408✔
UNCOV
170
            Self::IsConstant => DType::Bool(NonNullable),
×
UNCOV
171
            Self::IsSorted => DType::Bool(NonNullable),
×
UNCOV
172
            Self::IsStrictSorted => DType::Bool(NonNullable),
×
173
            Self::Max => data_type.clone(),
94✔
174
            Self::Min => data_type.clone(),
76✔
175
            Self::NullCount => DType::Primitive(PType::U64, NonNullable),
82✔
UNCOV
176
            Self::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
×
177
            Self::NaNCount => match data_type {
40✔
178
                DType::Primitive(ptype, ..) if ptype.is_float() => {
40✔
179
                    DType::Primitive(PType::U64, NonNullable)
32✔
180
                }
181
                // Any other type does not support NaN count
182
                _ => return None,
12✔
183
            },
184
            Self::Sum => {
185
                // Any array that cannot be summed has a sum DType of null.
186
                // Any array that can be summed, but overflows, has a sum _value_ of null.
187
                // Therefore, we make integer sum stats nullable.
188
                match data_type {
112✔
UNCOV
189
                    DType::Bool(_) => DType::Primitive(PType::U64, Nullable),
×
190
                    DType::Primitive(ptype, _) => match ptype {
108✔
191
                        PType::U8 | PType::U16 | PType::U32 | PType::U64 => {
192
                            DType::Primitive(PType::U64, Nullable)
12✔
193
                        }
194
                        PType::I8 | PType::I16 | PType::I32 | PType::I64 => {
195
                            DType::Primitive(PType::I64, Nullable)
48✔
196
                        }
197
                        PType::F16 | PType::F32 | PType::F64 => {
198
                            // Float sums cannot overflow, but all null floats still end up as null
199
                            DType::Primitive(PType::F64, Nullable)
48✔
200
                        }
201
                    },
UNCOV
202
                    DType::Extension(ext_dtype) => self.dtype(ext_dtype.storage_dtype())?,
×
203
                    // Unsupported types
204
                    DType::Null
205
                    // TODO(aduffy): implement more stats for Decimal
206
                    | DType::Decimal(..)
207
                    | DType::Utf8(_)
208
                    | DType::Binary(_)
209
                    | DType::Struct(..)
210
                    | DType::List(..) => return None,
4✔
211
                }
212
            }
213
        })
214
    }
408✔
215

216
    pub fn name(&self) -> &str {
110✔
217
        match self {
110✔
218
            Self::IsConstant => "is_constant",
×
219
            Self::IsSorted => "is_sorted",
×
220
            Self::IsStrictSorted => "is_strict_sorted",
×
221
            Self::Max => "max",
24✔
222
            Self::Min => "min",
24✔
223
            Self::NullCount => "null_count",
24✔
224
            Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
×
225
            Self::Sum => "sum",
24✔
226
            Self::NaNCount => "nan_count",
14✔
227
        }
228
    }
110✔
229

230
    pub fn all() -> impl Iterator<Item = Stat> {
22✔
231
        all::<Self>()
22✔
232
    }
22✔
233
}
234

235
pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
4✔
236
    let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize + 1;
4✔
237
    // TODO(ngates): use vortex-buffer::BitBuffer
238
    let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
4✔
239
        MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
4✔
240
        max_stat,
4✔
241
    );
242
    for stat in stats {
22✔
243
        stat_bitset.set_bit(u8::from(*stat) as usize, true);
18✔
244
    }
18✔
245

246
    stat_bitset
4✔
247
        .finish()
4✔
248
        .into_inner()
4✔
249
        .into_vec()
4✔
250
        .unwrap_or_else(|b| b.to_vec())
4✔
251
}
4✔
252

253
pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
4✔
254
    BitIterator::new(bytes, 0, bytes.len() * 8)
4✔
255
        .enumerate()
4✔
256
        .filter_map(|(i, b)| b.then_some(i))
64✔
257
        // Filter out indices failing conversion, these are stats written by newer version of library
258
        .filter_map(|i| {
18✔
259
            let Ok(stat) = u8::try_from(i) else {
18✔
260
                debug!("invalid stat encountered: {i}");
×
261
                return None;
×
262
            };
263
            Stat::try_from(stat).ok()
18✔
264
        })
18✔
265
        .collect::<Vec<_>>()
4✔
266
}
4✔
267

268
impl Display for Stat {
269
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
×
270
        write!(f, "{}", self.name())
×
271
    }
×
272
}
273

274
#[cfg(test)]
mod test {
    use enum_iterator::all;

    use crate::arrays::PrimitiveArray;
    use crate::stats::Stat;

    /// Computing the min of an all-null array must yield `None` instead of panicking.
    #[test]
    fn min_of_nulls_is_not_panic() {
        let all_nulls = PrimitiveArray::from_option_iter::<i32, _>([None, None, None, None]);
        let min = all_nulls.statistics().compute_as::<i64>(Stat::Min);

        assert_eq!(min, None);
    }

    /// Only `Min` and `Max` share the dtype of the array they are computed on.
    #[test]
    fn has_same_dtype_as_array() {
        for stat in all::<Stat>() {
            let expected = matches!(stat, Stat::Min | Stat::Max);
            assert_eq!(stat.has_same_dtype_as_array(), expected);
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc