• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 17042426005

18 Aug 2025 01:44PM UTC coverage: 87.995%. First build
17042426005

Pull #4216

github

web-flow
Merge 52d72e70b into c0b668f7f
Pull Request #4216: feat: better and more consistent validation in SerdeVTable::build

525 of 671 new or added lines in 80 files covered. (78.24%)

56705 of 64441 relevant lines covered (88.0%)

627615.79 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.91
/vortex-array/src/arrays/varbinview/compact.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5
//! be dropped.
6

7
use vortex_error::{VortexResult, VortexUnwrap};
8

9
use crate::arrays::VarBinViewArray;
10
use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11
use crate::validity::Validity;
12
use crate::vtable::ValidityHelper;
13

14
impl VarBinViewArray {
15
    /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
16
    /// operation can be very expensive, in the worst case copying all existing string data into
17
    /// a new allocation.
18
    ///
19
    /// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
20
    /// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
21
    /// so, will aggressively compact all visible outlined string data into a single new buffer.
22
    pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
4,066✔
23
        // If there is nothing to be gained by compaction, return the original array untouched.
24
        if !self.should_compact() {
4,066✔
25
            return Ok(self.clone());
2,618✔
26
        }
1,448✔
27

28
        // The compaction pathway depends on the validity.
29
        match self.validity() {
1,448✔
30
            // The array contains no values, all buffers can be dropped.
31
            // SAFETY: in an all-invalid array the views and buffers are never accessed, so the
            // existing views can be kept while every buffer is dropped.
32
            Validity::AllInvalid => unsafe {
NEW
33
                Ok(VarBinViewArray::new_unchecked(
×
NEW
34
                    self.views().clone(),
×
NEW
35
                    Default::default(),
×
NEW
36
                    self.dtype().clone(),
×
NEW
37
                    self.validity().clone(),
×
NEW
38
                ))
×
39
            },
40
            // Non-null pathway
41
            Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
1,097✔
42
            // Nullable pathway, requires null-checks for each value
43
            Validity::Array(_) => rebuild_nullable(self),
351✔
44
        }
45
    }
4,066✔
46

47
    fn should_compact(&self) -> bool {
4,066✔
48
        // If the array is entirely inlined strings, do not attempt to compact.
49
        if self.nbuffers() == 0 {
4,066✔
50
            return false;
2,041✔
51
        }
2,025✔
52

53
        let bytes_referenced: u64 = self.count_referenced_bytes();
2,025✔
54
        let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
7,005✔
55

56
        // If there is any wasted space, we want to repack.
57
        // This is very aggressive.
58
        bytes_referenced < buffer_total_bytes
2,025✔
59
    }
4,066✔
60

61
    // count the number of bytes addressed by the views, not including null
62
    // values or any inlined strings.
63
    fn count_referenced_bytes(&self) -> u64 {
2,025✔
64
        match self.validity() {
2,025✔
65
            Validity::AllInvalid => 0u64,
×
66
            _ => self
2,025✔
67
                .views()
2,025✔
68
                .iter()
2,025✔
69
                .enumerate()
2,025✔
70
                .map(|(idx, &view)| {
3,621,333✔
71
                    if !self.is_valid(idx).vortex_unwrap() || view.is_inlined() {
3,621,333✔
72
                        0u64
246,989✔
73
                    } else {
74
                        view.len() as u64
3,374,344✔
75
                    }
76
                })
3,621,333✔
77
                .sum(),
2,025✔
78
        }
79
    }
2,025✔
80
}
81

82
// Nullable string array compaction pathway.
83
// This requires a null check on every append.
84
fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
351✔
85
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
351✔
86
    for i in 0..array.len() {
1,989✔
87
        if !array.is_valid(i)? {
1,989✔
88
            builder.append_null();
741✔
89
        } else {
1,248✔
90
            let bytes = array.bytes_at(i);
1,248✔
91
            builder.append_value(bytes.as_slice());
1,248✔
92
        }
1,248✔
93
    }
94

95
    Ok(builder.finish_into_varbinview())
351✔
96
}
351✔
97

98
// Compaction for string arrays that contain no null values. Saves a branch
99
// for every string element.
100
fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
1,097✔
101
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
1,097✔
102
    for i in 0..array.len() {
3,591,634✔
103
        builder.append_value(array.bytes_at(i).as_ref());
3,591,634✔
104
    }
3,591,634✔
105
    Ok(builder.finish_into_varbinview())
1,097✔
106
}
1,097✔
107

108
#[cfg(test)]
109
mod tests {
110
    use vortex_buffer::buffer;
111

112
    use crate::IntoArray;
113
    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
114
    use crate::compute::take;
115

116
    #[test]
117
    fn test_optimize_compacts_buffers() {
1✔
118
        // Create a VarBinViewArray with some long strings that will create multiple buffers
119
        let original = VarBinViewArray::from_iter_nullable_str([
1✔
120
            Some("short"),
1✔
121
            Some("this is a longer string that will be stored in a buffer"),
1✔
122
            Some("medium length string"),
1✔
123
            Some("another very long string that definitely needs a buffer to store it"),
1✔
124
            Some("tiny"),
1✔
125
        ]);
1✔
126

127
        // Verify it has buffers
128
        assert!(original.nbuffers() > 0);
1✔
129
        let original_buffers = original.nbuffers();
1✔
130

131
        // Take only the first and last elements (indices 0 and 4)
132
        let indices = buffer![0u32, 4u32].into_array();
1✔
133
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
134
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
135

136
        // The taken array should still have the same number of buffers
137
        assert_eq!(taken_array.nbuffers(), original_buffers);
1✔
138

139
        // Now optimize the taken array
140
        let optimized_array = taken_array.compact_buffers().unwrap();
1✔
141

142
        // The optimized array should have compacted buffers
143
        // Since both remaining strings are short, they should be inlined
144
        // so we might have 0 buffers, or 1 buffer if any were not inlined
145
        assert!(optimized_array.nbuffers() <= 1);
1✔
146

147
        // Verify the data is still correct
148
        assert_eq!(optimized_array.len(), 2);
1✔
149
        assert_eq!(optimized_array.scalar_at(0), "short".into());
1✔
150
        assert_eq!(optimized_array.scalar_at(1), "tiny".into());
1✔
151
    }
1✔
152

153
    #[test]
154
    fn test_optimize_with_long_strings() {
1✔
155
        // Create strings that are definitely longer than 12 bytes
156
        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
1✔
157
        let long_string_2 = "another extremely long string that also needs external buffer storage";
1✔
158
        let long_string_3 = "yet another long string for testing buffer compaction functionality";
1✔
159

160
        let original = VarBinViewArray::from_iter_str([
1✔
161
            long_string_1,
1✔
162
            long_string_2,
1✔
163
            long_string_3,
1✔
164
            "short1",
1✔
165
            "short2",
1✔
166
        ]);
1✔
167

168
        // Take only the first and third long strings (indices 0 and 2)
169
        let indices = buffer![0u32, 2u32].into_array();
1✔
170
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
171
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
172

173
        // Optimize the taken array
174
        let optimized_array = taken_array.compact_buffers().unwrap();
1✔
175

176
        // The optimized array should have exactly 1 buffer (consolidated)
177
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
178

179
        // Verify the data is still correct
180
        assert_eq!(optimized_array.len(), 2);
1✔
181
        assert_eq!(optimized_array.scalar_at(0), long_string_1.into());
1✔
182
        assert_eq!(optimized_array.scalar_at(1), long_string_3.into());
1✔
183
    }
1✔
184

185
    #[test]
186
    fn test_optimize_no_buffers() {
1✔
187
        // Create an array with only short strings (all inlined)
188
        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
1✔
189

190
        // This should have no buffers
191
        assert_eq!(original.nbuffers(), 0);
1✔
192

193
        // Optimize should return the same array
194
        let optimized_array = original.compact_buffers().unwrap();
1✔
195

196
        assert_eq!(optimized_array.nbuffers(), 0);
1✔
197
        assert_eq!(optimized_array.len(), 4);
1✔
198

199
        // Verify all values are preserved
200
        for i in 0..4 {
5✔
201
            assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
4✔
202
        }
203
    }
1✔
204

205
    #[test]
206
    fn test_optimize_single_buffer() {
1✔
207
        // Create an array that naturally has only one buffer
208
        let str1 = "this is a long string that goes into a buffer";
1✔
209
        let str2 = "another long string in the same buffer";
1✔
210
        let original = VarBinViewArray::from_iter_str([str1, str2]);
1✔
211

212
        // Should have 1 compact buffer
213
        assert_eq!(original.nbuffers(), 1);
1✔
214
        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
1✔
215

216
        // Optimize should return the same array (no change needed)
217
        let optimized_array = original.compact_buffers().unwrap();
1✔
218

219
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
220
        assert_eq!(optimized_array.len(), 2);
1✔
221

222
        // Verify all values are preserved
223
        for i in 0..2 {
3✔
224
            assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
2✔
225
        }
226
    }
1✔
227
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc