• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16497799136

24 Jul 2025 01:10PM UTC coverage: 81.12% (+0.03%) from 81.091%
16497799136

Pull #4002

github

web-flow
Merge d12e93d9f into 3e5a1339f
Pull Request #4002: fix: Implement compact for ListArray

104 of 114 new or added lines in 4 files covered. (91.23%)

22 existing lines in 3 files now uncovered.

42189 of 52008 relevant lines covered (81.12%)

173445.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.12
/vortex-array/src/arrays/varbinview/compact.rs
1
//  SPDX-License-Identifier: Apache-2.0
2
//  SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5
//! be dropped.
6

7
use vortex_error::{VortexResult, VortexUnwrap};
8

9
use crate::arrays::VarBinViewArray;
10
use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11
use crate::validity::Validity;
12
use crate::vtable::ValidityHelper;
13

14
impl VarBinViewArray {
15
    /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
16
    /// operation can be very expensive, in the worst cast copying all existing string data into
17
    /// a new allocation.
18
    ///
19
    /// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
20
    /// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
21
    /// so, will aggressively compact all visile outlined string data into a single new buffer.
22
    pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
3,508✔
23
        // If there is nothing to be gained by compaction, return the original array untouched.
24
        if !self.should_compact() {
3,508✔
25
            return Ok(self.clone());
2,301✔
26
        }
1,207✔
27

28
        // Compaction pathways, depend on the validity
29
        match self.validity() {
1,207✔
30
            // The array contains no values, all buffers can be dropped.
NEW
31
            Validity::AllInvalid => Ok(VarBinViewArray::try_new(
×
NEW
32
                self.views().clone(),
×
NEW
33
                vec![],
×
NEW
34
                self.dtype().clone(),
×
NEW
35
                self.validity().clone(),
×
NEW
36
            )?),
×
37
            // Non-null pathway
38
            Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
874✔
39
            // Nullable pathway, requires null-checks for each value
40
            Validity::Array(_) => rebuild_nullable(self),
333✔
41
        }
42
    }
3,508✔
43

44
    fn should_compact(&self) -> bool {
3,508✔
45
        // If the array is entirely inlined strings, do not attempt to compact.
46
        if self.nbuffers() == 0 {
3,508✔
47
            return false;
1,837✔
48
        }
1,671✔
49

50
        let bytes_referenced: u64 = self.count_referenced_bytes();
1,671✔
51
        let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
5,119✔
52

53
        // If there is any wasted space, we want to repack.
54
        // This is very aggressive.
55
        bytes_referenced < buffer_total_bytes
1,671✔
56
    }
3,508✔
57

58
    // count the number of bytes addressed by the views, not including null
59
    // values or any inlined strings.
60
    fn count_referenced_bytes(&self) -> u64 {
1,671✔
61
        match self.validity() {
1,671✔
NEW
62
            Validity::AllInvalid => 0u64,
×
63
            _ => self
1,671✔
64
                .views()
1,671✔
65
                .iter()
1,671✔
66
                .enumerate()
1,671✔
67
                .map(|(idx, &view)| {
2,416,420✔
68
                    if !self.is_valid(idx).vortex_unwrap() || view.is_inlined() {
2,416,420✔
69
                        0u64
166,856✔
70
                    } else {
71
                        view.len() as u64
2,249,564✔
72
                    }
73
                })
2,416,420✔
74
                .sum(),
1,671✔
75
        }
76
    }
1,671✔
77
}
78

79
// Nullable string array compaction pathway.
80
// This requires a null check on every append.
81
fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
333✔
82
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
333✔
83
    for i in 0..array.len() {
1,887✔
84
        if !array.is_valid(i)? {
1,887✔
85
            builder.append_null();
703✔
86
        } else {
1,184✔
87
            let bytes = array.bytes_at(i);
1,184✔
88
            builder.append_value(bytes.as_slice());
1,184✔
89
        }
1,184✔
90
    }
91

92
    Ok(builder.finish_into_varbinview())
333✔
93
}
333✔
94

95
// Compaction for string arrays that contain no null values. Saves a branch
96
// for every string element.
97
fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
874✔
98
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
874✔
99
    for i in 0..array.len() {
2,295,778✔
100
        builder.append_value(array.bytes_at(i).as_ref());
2,295,778✔
101
    }
2,295,778✔
102
    Ok(builder.finish_into_varbinview())
874✔
103
}
874✔
104

105
#[cfg(test)]
106
mod tests {
107
    use vortex_buffer::buffer;
108

109
    use crate::IntoArray;
110
    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
111
    use crate::compute::take;
112

113
    #[test]
114
    fn test_optimize_compacts_buffers() {
1✔
115
        // Create a VarBinViewArray with some long strings that will create multiple buffers
116
        let original = VarBinViewArray::from_iter_nullable_str([
1✔
117
            Some("short"),
1✔
118
            Some("this is a longer string that will be stored in a buffer"),
1✔
119
            Some("medium length string"),
1✔
120
            Some("another very long string that definitely needs a buffer to store it"),
1✔
121
            Some("tiny"),
1✔
122
        ]);
1✔
123

124
        // Verify it has buffers
125
        assert!(original.nbuffers() > 0);
1✔
126
        let original_buffers = original.nbuffers();
1✔
127

128
        // Take only the first and last elements (indices 0 and 4)
129
        let indices = buffer![0u32, 4u32].into_array();
1✔
130
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
131
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
132

133
        // The taken array should still have the same number of buffers
134
        assert_eq!(taken_array.nbuffers(), original_buffers);
1✔
135

136
        // Now optimize the taken array
137
        let optimized_array = taken_array.compact_buffers().unwrap();
1✔
138

139
        // The optimized array should have compacted buffers
140
        // Since both remaining strings are short, they should be inlined
141
        // so we might have 0 buffers, or 1 buffer if any were not inlined
142
        assert!(optimized_array.nbuffers() <= 1);
1✔
143

144
        // Verify the data is still correct
145
        assert_eq!(optimized_array.len(), 2);
1✔
146
        assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
1✔
147
        assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
1✔
148
    }
1✔
149

150
    #[test]
151
    fn test_optimize_with_long_strings() {
1✔
152
        // Create strings that are definitely longer than 12 bytes
153
        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
1✔
154
        let long_string_2 = "another extremely long string that also needs external buffer storage";
1✔
155
        let long_string_3 = "yet another long string for testing buffer compaction functionality";
1✔
156

157
        let original = VarBinViewArray::from_iter_str([
1✔
158
            long_string_1,
1✔
159
            long_string_2,
1✔
160
            long_string_3,
1✔
161
            "short1",
1✔
162
            "short2",
1✔
163
        ]);
1✔
164

165
        // Take only the first and third long strings (indices 0 and 2)
166
        let indices = buffer![0u32, 2u32].into_array();
1✔
167
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
168
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
169

170
        // Optimize the taken array
171
        let optimized_array = taken_array.compact_buffers().unwrap();
1✔
172

173
        // The optimized array should have exactly 1 buffer (consolidated)
174
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
175

176
        // Verify the data is still correct
177
        assert_eq!(optimized_array.len(), 2);
1✔
178
        assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
1✔
179
        assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
1✔
180
    }
1✔
181

182
    #[test]
183
    fn test_optimize_no_buffers() {
1✔
184
        // Create an array with only short strings (all inlined)
185
        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
1✔
186

187
        // This should have no buffers
188
        assert_eq!(original.nbuffers(), 0);
1✔
189

190
        // Optimize should return the same array
191
        let optimized_array = original.compact_buffers().unwrap();
1✔
192

193
        assert_eq!(optimized_array.nbuffers(), 0);
1✔
194
        assert_eq!(optimized_array.len(), 4);
1✔
195

196
        // Verify all values are preserved
197
        for i in 0..4 {
5✔
198
            assert_eq!(
4✔
199
                optimized_array.scalar_at(i).unwrap(),
4✔
200
                original.scalar_at(i).unwrap()
4✔
201
            );
202
        }
203
    }
1✔
204

205
    #[test]
206
    fn test_optimize_single_buffer() {
1✔
207
        // Create an array that naturally has only one buffer
208
        let str1 = "this is a long string that goes into a buffer";
1✔
209
        let str2 = "another long string in the same buffer";
1✔
210
        let original = VarBinViewArray::from_iter_str([str1, str2]);
1✔
211

212
        // Should have 1 compact buffer
213
        assert_eq!(original.nbuffers(), 1);
1✔
214
        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
1✔
215

216
        // Optimize should return the same array (no change needed)
217
        let optimized_array = original.compact_buffers().unwrap();
1✔
218

219
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
220
        assert_eq!(optimized_array.len(), 2);
1✔
221

222
        // Verify all values are preserved
223
        for i in 0..2 {
3✔
224
            assert_eq!(
2✔
225
                optimized_array.scalar_at(i).unwrap(),
2✔
226
                original.scalar_at(i).unwrap()
2✔
227
            );
228
        }
229
    }
1✔
230
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc