• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16566687104

28 Jul 2025 10:31AM UTC coverage: 81.78%. Remained the same
16566687104

Pull #4035

github

web-flow
Merge 53789e31b into a0efcbe7a
Pull Request #4035: use arc slice for buffers in varbinview

22 of 23 new or added lines in 14 files covered. (95.65%)

90 existing lines in 8 files now uncovered.

43220 of 52849 relevant lines covered (81.78%)

170701.56 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

94.12
/vortex-array/src/arrays/varbinview/compact.rs
1
//  SPDX-License-Identifier: Apache-2.0
2
//  SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5
//! be dropped.
6

7
use vortex_error::{VortexResult, VortexUnwrap};
8

9
use crate::arrays::VarBinViewArray;
10
use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11
use crate::validity::Validity;
12
use crate::vtable::ValidityHelper;
13

14
/// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
15
/// operation can be very expensive, in the worst cast copying all existing string data into
16
/// a new allocation.
17
///
18
/// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
19
/// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
20
/// so, will aggressively compact all visile outlined string data into a single new buffer.
21
pub fn compact_buffers(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
3,508✔
22
    // If there is nothing to be gained by compaction, return the original array untouched.
23
    if !should_compact(array) {
3,508✔
24
        return Ok(array.clone());
2,301✔
25
    }
1,207✔
26

27
    // Compaction pathways, depend on the validity
28
    match array.validity() {
1,207✔
29
        // The array contains no values, all buffers can be dropped.
30
        Validity::AllInvalid => Ok(VarBinViewArray::try_new(
×
31
            array.views().clone(),
×
NEW
32
            vec![],
×
33
            array.dtype().clone(),
×
34
            array.validity().clone(),
×
35
        )?),
×
36
        // Non-null pathway
37
        Validity::NonNullable | Validity::AllValid => rebuild_nonnull(array),
874✔
38
        // Nullable pathway, requires null-checks for each value
39
        Validity::Array(_) => rebuild_nullable(array),
333✔
40
    }
41
}
3,508✔
42

43
fn should_compact(array: &VarBinViewArray) -> bool {
3,508✔
44
    // If the array is entirely inlined strings, do not attempt to compact.
45
    if array.nbuffers() == 0 {
3,508✔
46
        return false;
1,837✔
47
    }
1,671✔
48

49
    let bytes_referenced: u64 = count_referenced_bytes(array);
1,671✔
50
    let buffer_total_bytes: u64 = array.buffers.iter().map(|buf| buf.len() as u64).sum();
5,119✔
51

52
    // If there is any wasted space, we want to repack.
53
    // This is very aggressive.
54
    bytes_referenced < buffer_total_bytes
1,671✔
55
}
3,508✔
56

57
// count the number of bytes addressed by the views, not including null
58
// values or any inlined strings.
59
fn count_referenced_bytes(array: &VarBinViewArray) -> u64 {
1,671✔
60
    match array.validity() {
1,671✔
61
        Validity::AllInvalid => 0u64,
×
62
        _ => {
63
            array
1,671✔
64
                .views()
1,671✔
65
                .iter()
1,671✔
66
                .enumerate()
1,671✔
67
                .map(|(idx, &view)| {
2,416,420✔
68
                    if !array.is_valid(idx).vortex_unwrap() || view.is_inlined() {
2,416,420✔
69
                        0u64
166,856✔
70
                    } else {
71
                        // SAFETY: in this branch the view is not inlined.
72
                        unsafe { view._ref }.size as u64
2,249,564✔
73
                    }
74
                })
2,416,420✔
75
                .sum()
1,671✔
76
        }
77
    }
78
}
1,671✔
79

80
// Nullable string array compaction pathway.
81
// This requires a null check on every append.
82
fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
333✔
83
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
333✔
84
    for i in 0..array.len() {
1,887✔
85
        if !array.is_valid(i)? {
1,887✔
86
            builder.append_null();
703✔
87
        } else {
1,184✔
88
            let bytes = array.bytes_at(i);
1,184✔
89
            builder.append_value(bytes.as_slice());
1,184✔
90
        }
1,184✔
91
    }
92

93
    Ok(builder.finish_into_varbinview())
333✔
94
}
333✔
95

96
// Compaction for string arrays that contain no null values. Saves a branch
97
// for every string element.
98
fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
874✔
99
    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
874✔
100
    for i in 0..array.len() {
2,295,778✔
101
        builder.append_value(array.bytes_at(i).as_ref());
2,295,778✔
102
    }
2,295,778✔
103
    Ok(builder.finish_into_varbinview())
874✔
104
}
874✔
105

106
#[cfg(test)]
107
mod tests {
108
    use vortex_buffer::buffer;
109

110
    use crate::IntoArray;
111
    use crate::arrays::varbinview::compact::compact_buffers;
112
    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
113
    use crate::compute::take;
114

115
    #[test]
116
    fn test_optimize_compacts_buffers() {
1✔
117
        // Create a VarBinViewArray with some long strings that will create multiple buffers
118
        let original = VarBinViewArray::from_iter_nullable_str([
1✔
119
            Some("short"),
1✔
120
            Some("this is a longer string that will be stored in a buffer"),
1✔
121
            Some("medium length string"),
1✔
122
            Some("another very long string that definitely needs a buffer to store it"),
1✔
123
            Some("tiny"),
1✔
124
        ]);
1✔
125

126
        // Verify it has buffers
127
        assert!(original.nbuffers() > 0);
1✔
128
        let original_buffers = original.nbuffers();
1✔
129

130
        // Take only the first and last elements (indices 0 and 4)
131
        let indices = buffer![0u32, 4u32].into_array();
1✔
132
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
133
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
134

135
        // The taken array should still have the same number of buffers
136
        assert_eq!(taken_array.nbuffers(), original_buffers);
1✔
137

138
        // Now optimize the taken array
139
        let optimized_array = compact_buffers(taken_array).unwrap();
1✔
140

141
        // The optimized array should have compacted buffers
142
        // Since both remaining strings are short, they should be inlined
143
        // so we might have 0 buffers, or 1 buffer if any were not inlined
144
        assert!(optimized_array.nbuffers() <= 1);
1✔
145

146
        // Verify the data is still correct
147
        assert_eq!(optimized_array.len(), 2);
1✔
148
        assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
1✔
149
        assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
1✔
150
    }
1✔
151

152
    #[test]
153
    fn test_optimize_with_long_strings() {
1✔
154
        // Create strings that are definitely longer than 12 bytes
155
        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
1✔
156
        let long_string_2 = "another extremely long string that also needs external buffer storage";
1✔
157
        let long_string_3 = "yet another long string for testing buffer compaction functionality";
1✔
158

159
        let original = VarBinViewArray::from_iter_str([
1✔
160
            long_string_1,
1✔
161
            long_string_2,
1✔
162
            long_string_3,
1✔
163
            "short1",
1✔
164
            "short2",
1✔
165
        ]);
1✔
166

167
        // Take only the first and third long strings (indices 0 and 2)
168
        let indices = buffer![0u32, 2u32].into_array();
1✔
169
        let taken = take(original.as_ref(), &indices).unwrap();
1✔
170
        let taken_array = taken.as_::<VarBinViewVTable>();
1✔
171

172
        // Optimize the taken array
173
        let optimized_array = compact_buffers(taken_array).unwrap();
1✔
174

175
        // The optimized array should have exactly 1 buffer (consolidated)
176
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
177

178
        // Verify the data is still correct
179
        assert_eq!(optimized_array.len(), 2);
1✔
180
        assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
1✔
181
        assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
1✔
182
    }
1✔
183

184
    #[test]
185
    fn test_optimize_no_buffers() {
1✔
186
        // Create an array with only short strings (all inlined)
187
        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
1✔
188

189
        // This should have no buffers
190
        assert_eq!(original.nbuffers(), 0);
1✔
191

192
        // Optimize should return the same array
193
        let optimized_array = compact_buffers(&original).unwrap();
1✔
194

195
        assert_eq!(optimized_array.nbuffers(), 0);
1✔
196
        assert_eq!(optimized_array.len(), 4);
1✔
197

198
        // Verify all values are preserved
199
        for i in 0..4 {
5✔
200
            assert_eq!(
4✔
201
                optimized_array.scalar_at(i).unwrap(),
4✔
202
                original.scalar_at(i).unwrap()
4✔
203
            );
204
        }
205
    }
1✔
206

207
    #[test]
208
    fn test_optimize_single_buffer() {
1✔
209
        // Create an array that naturally has only one buffer
210
        let str1 = "this is a long string that goes into a buffer";
1✔
211
        let str2 = "another long string in the same buffer";
1✔
212
        let original = VarBinViewArray::from_iter_str([str1, str2]);
1✔
213

214
        // Should have 1 compact buffer
215
        assert_eq!(original.nbuffers(), 1);
1✔
216
        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
1✔
217

218
        // Optimize should return the same array (no change needed)
219
        let optimized_array = compact_buffers(&original).unwrap();
1✔
220

221
        assert_eq!(optimized_array.nbuffers(), 1);
1✔
222
        assert_eq!(optimized_array.len(), 2);
1✔
223

224
        // Verify all values are preserved
225
        for i in 0..2 {
3✔
226
            assert_eq!(
2✔
227
                optimized_array.scalar_at(i).unwrap(),
2✔
228
                original.scalar_at(i).unwrap()
2✔
229
            );
230
        }
231
    }
1✔
232
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc