• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 17073077835

19 Aug 2025 02:40PM UTC coverage: 24.083%. First build
17073077835

Pull #4177

github

web-flow
Merge b42e5758f into 431a8f2b5
Pull Request #4177: feat: ArrayOperations infallible, eager validation + new_unchecked

197 of 1455 new or added lines in 154 files covered. (13.54%)

8646 of 35901 relevant lines covered (24.08%)

142.28 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/vortex-btrblocks/src/integer/dictionary.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Dictionary compressor that reuses the unique values in the `IntegerStats`.
5

6
use vortex_array::IntoArray;
7
use vortex_array::arrays::PrimitiveArray;
8
use vortex_array::validity::Validity;
9
use vortex_array::vtable::ValidityHelper;
10
use vortex_buffer::Buffer;
11
use vortex_dict::DictArray;
12

13
use crate::integer::IntegerStats;
14
use crate::integer::stats::ErasedStats;
15

16
/// Builds a [`DictArray`] from integer statistics for one concrete native type.
///
/// Arguments:
/// - `$stats`: the stats value whose `src` array is being encoded.
/// - `$typed`: the typed stats; the keys of its `distinct_values` become the dictionary values.
/// - `$validity`: validity of the source array; it is cloned onto the codes array.
/// - `$typ`: native integer type of the source array.
///
/// The codes are emitted as `u8`, `u16` or `u32`, whichever is the narrowest type able to
/// hold the largest code.
macro_rules! typed_encode {
    ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
        let values: Buffer<$typ> = $typed.distinct_values.keys().map(|x| x.0).collect();
        let source = $stats.src.as_slice::<$typ>();

        // Codes are positions into `values`, so the largest code is `len - 1`, not `len`.
        // Comparing the count itself would needlessly widen the codes when there are
        // exactly 256 or 65536 distinct values.
        let max_code = values.len().saturating_sub(1);
        let codes = if max_code <= u8::MAX as usize {
            let buf = <DictEncoder as Encode<$typ, u8>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else if max_code <= u16::MAX as usize {
            let buf = <DictEncoder as Encode<$typ, u16>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        } else {
            let buf = <DictEncoder as Encode<$typ, u32>>::encode(&values, source);
            PrimitiveArray::new(buf, $validity.clone()).into_array()
        };

        // Nulls are carried by the codes' validity; the dictionary values themselves are
        // all valid. Only the nullability of the source is mirrored here.
        let values_validity = match $validity {
            Validity::NonNullable => Validity::NonNullable,
            _ => Validity::AllValid,
        };

        let values = PrimitiveArray::new(values, values_validity).into_array();
        // SAFETY: invariants enforced in DictEncoder, which derives each code from the
        // position of its value in `values`.
        unsafe { DictArray::new_unchecked(codes, values) }
    }};
}
45

46
#[allow(clippy::cognitive_complexity)]
NEW
47
pub fn dictionary_encode(stats: &IntegerStats) -> DictArray {
×
48
    // We need to preserve the nullability somehow from the original
49
    let src_validity = stats.src.validity();
×
50

51
    match &stats.typed {
×
52
        ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8),
×
53
        ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16),
×
54
        ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32),
×
55
        ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64),
×
56
        ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8),
×
57
        ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16),
×
58
        ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32),
×
59
        ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64),
×
60
    }
61
}
×
62

63
/// Unit type that carries the [`Encode`] implementations generated by `impl_encode!`.
struct DictEncoder;
64

65
trait Encode<T, I> {
66
    /// Using the distinct value set, turn the values into a set of codes.
67
    fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
68
}
69

70
macro_rules! impl_encode {
71
    ($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
72
    ($typ:ty, $($ityp:ty),+) => {
73
        $(
74
        impl Encode<$typ, $ityp> for DictEncoder {
75
            #[allow(clippy::cast_possible_truncation)]
76
            fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
×
77
                let mut codes =
×
78
                    vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
×
79
                        distinct.len(),
×
80
                    );
81
                for (code, &value) in distinct.iter().enumerate() {
×
82
                    codes.insert(value, code as $ityp);
×
83
                }
×
84

85
                let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
×
86
                for value in values {
×
87
                    // Any code lookups which fail are for nulls, so their value
×
88
                    // does not matter.
×
89
                    // SAFETY: we have exactly sized output to be as large as values.
×
90
                    unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
×
91
                }
×
92

93
                return output.freeze();
×
94
            }
×
95
        }
96
        )*
97
    };
98
}
99

100
impl_encode!(u8);
101
impl_encode!(u16);
102
impl_encode!(u32);
103
impl_encode!(u64);
104
impl_encode!(i8);
105
impl_encode!(i16);
106
impl_encode!(i32);
107
impl_encode!(i64);
108

109
#[cfg(test)]
mod tests {
    use vortex_array::arrays::{BoolArray, PrimitiveArray};
    use vortex_array::validity::Validity;
    use vortex_array::{Array, IntoArray, ToCanonical};
    use vortex_buffer::buffer;

    use crate::CompressorStats;
    use crate::integer::IntegerStats;
    use crate::integer::dictionary::dictionary_encode;

    /// Encodes a nullable array and checks the dictionary size, the number of codes and
    /// the decoded contents.
    #[test]
    fn test_dict_encode_integer_stats() {
        // Five rows, of which the fourth is null.
        let array = PrimitiveArray::new(
            buffer![100i32, 200, 100, 0, 100],
            Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
        );

        let stats = IntegerStats::generate(&array);
        let dict_array = dictionary_encode(&stats);

        // Two dictionary values are expected, and one code per input row.
        assert_eq!(dict_array.values().len(), 2);
        assert_eq!(dict_array.codes().len(), 5);

        let undict = dict_array.to_primitive().unwrap();

        // The null slot is given code zero; which code it gets is irrelevant because the
        // validity marks the slot as null anyway.
        assert_eq!(undict.as_slice::<i32>(), &[100i32, 200, 100, 100, 100]);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc