• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 17075133033

19 Aug 2025 04:01PM UTC coverage: 87.949% (+0.09%) from 87.856%
17075133033

push

github

web-flow
feat: ArrayOperations infallible, eager validation + new_unchecked (#4177)

ArrayOperations currently return VortexResult<>, but they really should
just be infallible. A failed array op is generally indicative of
programmer or encoding error. There's really nothing interesting we can
do to handle an out-of-bounds slice() or scalar_at.

There's a lot that falls out of this, like fixing a bunch of tests,
tweaking our scalar value casting to return Option instead of Result,
etc.

---------

Signed-off-by: Andrew Duffy <andrew@a10y.dev>

1744 of 1985 new or added lines in 195 files covered. (87.86%)

36 existing lines in 27 files now uncovered.

56745 of 64520 relevant lines covered (87.95%)

624082.56 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.29
/vortex-array/src/arrays/chunked/array.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! First-class chunked arrays.
5
//!
6
//! Vortex is a chunked array library that's able to treat a sequence of chunks as one array.
7

8
use std::fmt::Debug;
9

10
use futures_util::stream;
11
use itertools::Itertools;
12
use vortex_buffer::{Buffer, BufferMut};
13
use vortex_dtype::DType;
14
use vortex_error::{VortexExpect as _, VortexResult, VortexUnwrap, vortex_bail};
15
use vortex_mask::Mask;
16

17
use crate::arrays::ChunkedVTable;
18
use crate::iter::{ArrayIterator, ArrayIteratorAdapter};
19
use crate::search_sorted::{SearchSorted, SearchSortedSide};
20
use crate::stats::{ArrayStats, StatsSetRef};
21
use crate::stream::{ArrayStream, ArrayStreamAdapter};
22
use crate::vtable::{ArrayVTable, ValidityVTable};
23
use crate::{Array, ArrayRef, IntoArray};
24

25
/// An array made up of a sequence of chunk arrays that share a single `DType`.
///
/// Cumulative row offsets are precomputed at construction so that a logical row index can be
/// mapped to a `(chunk, index-in-chunk)` pair by `find_chunk_idx`.
#[derive(Clone, Debug)]
26
pub struct ChunkedArray {
27
    /// The `DType` of the array; `try_new` checks every chunk against it.
    dtype: DType,
28
    /// Total number of rows, i.e. the sum of the lengths of all chunks.
    len: usize,
29
    /// Cumulative row offsets: `nchunks + 1` entries, starting with a leading 0.
    chunk_offsets: Buffer<u64>,
30
    /// The chunks, in logical order. Chunks may be empty.
    chunks: Vec<ArrayRef>,
31
    /// Statistics holder for this array; initialised with `Default::default()`.
    stats_set: ArrayStats,
32
}
33

34
impl ChunkedArray {
35
    /// Create a new `ChunkedArray` from `chunks`, checking that each chunk has exactly `dtype`.
    ///
    /// # Errors
    ///
    /// Bails with a `MismatchedTypes` error on the first chunk whose dtype differs from `dtype`.
    pub fn try_new(chunks: Vec<ArrayRef>, dtype: DType) -> VortexResult<Self> {
15,173✔
36
        for chunk in &chunks {
76,331✔
37
            if chunk.dtype() != &dtype {
61,158✔
38
                vortex_bail!(MismatchedTypes: dtype, chunk.dtype());
×
39
            }
61,158✔
40
        }
41

42
        // SAFETY: the loop above verified that every chunk has the given dtype, which is the
        // only requirement `new_unchecked` places on its caller.
43
        unsafe { Ok(Self::new_unchecked(chunks, dtype)) }
15,173✔
44
    }
15,173✔
45

46
    /// Create a new `ChunkedArray` from a set of chunks without verifying that all chunks have
47
    /// the same DType.
48
    ///
    /// The cumulative chunk offsets and the total length are still computed here from the chunk
    /// lengths; only the dtype check is skipped.
    ///
49
    /// # Safety
50
    ///
51
    /// The caller must ensure that all chunks have the same DType, else downstream operations
52
    /// may break correctness assumptions about `ChunkedArray` child types.
    ///
    /// # Panics
    ///
    /// NOTE(review): `vortex_unwrap` on the `u64 -> usize` conversion presumably panics if the
    /// summed chunk lengths do not fit in `usize` — confirm against `VortexUnwrap`.
53
    pub unsafe fn new_unchecked(chunks: Vec<ArrayRef>, dtype: DType) -> Self {
15,437✔
54
        let nchunks = chunks.len();
15,437✔
55

56
        let mut chunk_offsets = BufferMut::<u64>::with_capacity(nchunks + 1);
15,437✔
57
        // SAFETY: capacity for `nchunks + 1` elements was reserved above and nothing has been
        // pushed yet, so the leading 0 fits.
58
        unsafe { chunk_offsets.push_unchecked(0) }
15,437✔
59
        let mut curr_offset = 0;
15,437✔
60
        for c in &chunks {
77,203✔
61
            curr_offset += c.len() as u64;
61,766✔
62
            // SAFETY: this loop pushes exactly one offset per chunk (`nchunks` in total) after
            // the single leading 0, staying within the reserved `nchunks + 1` capacity.
63
            unsafe { chunk_offsets.push_unchecked(curr_offset) }
61,766✔
64
        }
65

66
        Self {
15,437✔
67
            dtype,
15,437✔
68
            // After the loop `curr_offset` is the sum of all chunk lengths.
            len: curr_offset.try_into().vortex_unwrap(),
15,437✔
69
            chunk_offsets: chunk_offsets.freeze(),
15,437✔
70
            chunks,
15,437✔
71
            stats_set: Default::default(),
15,437✔
72
        }
15,437✔
73
    }
15,437✔
74

75
    /// Returns a reference to the chunk at position `idx`.
    ///
    /// # Panics
    ///
    /// Panics with "chunk index {idx} out of bounds" if `idx >= self.nchunks()`.
    #[inline]
76
    pub fn chunk(&self, idx: usize) -> &ArrayRef {
55,799✔
77
        assert!(idx < self.nchunks(), "chunk index {idx} out of bounds");
55,799✔
78

79
        &self.chunks[idx]
55,799✔
80
    }
55,799✔
81

82
    /// Returns the number of chunks, counting empty chunks as well.
    pub fn nchunks(&self) -> usize {
145,156✔
83
        self.chunks.len()
145,156✔
84
    }
145,156✔
85

86
    /// Returns the cumulative row offsets of the chunks.
    ///
    /// The buffer holds `nchunks + 1` values: a leading 0 followed by the running total of chunk
    /// lengths after each chunk, as built in `new_unchecked`.
    #[inline]
87
    pub fn chunk_offsets(&self) -> &Buffer<u64> {
117,224✔
88
        &self.chunk_offsets
117,224✔
89
    }
117,224✔
90

91
    /// Maps a logical row `index` to `(chunk index, index within that chunk)`.
    ///
    /// Note that `index == self.len()` is accepted (the bound check is `<=`, not `<`), so the
    /// returned in-chunk index may equal the length of the returned chunk.
    ///
    /// # Panics
    ///
    /// Panics with "Index out of bounds of the array" if `index > self.len()`.
    pub(crate) fn find_chunk_idx(&self, index: usize) -> (usize, usize) {
58,497✔
92
        assert!(index <= self.len(), "Index out of bounds of the array");
58,497✔
93
        let index = index as u64;
58,497✔
94

95
        // Since there might be duplicate values in offsets because of empty chunks we want to search from right
96
        // and take the last chunk (we subtract 1 since there's a leading 0)
97
        let index_chunk = self
58,497✔
98
            .chunk_offsets()
58,497✔
99
            .search_sorted(&index, SearchSortedSide::Right)
58,497✔
100
            .to_ends_index(self.nchunks() + 1)
58,497✔
101
            .saturating_sub(1);
58,497✔
102
        // Row offset at which the selected chunk begins.
        let chunk_start = self.chunk_offsets()[index_chunk];
58,497✔
103

104
        let index_in_chunk =
58,497✔
105
            usize::try_from(index - chunk_start).vortex_expect("Index is too large for usize");
58,497✔
106
        (index_chunk, index_in_chunk)
58,497✔
107
    }
58,497✔
108

109
    /// Returns all chunks as a slice, in logical order and including empty chunks.
    pub fn chunks(&self) -> &[ArrayRef] {
17,637✔
110
        &self.chunks
17,637✔
111
    }
17,637✔
112

113
    /// Returns an iterator over the chunks for which `is_empty()` is false, in logical order.
    pub fn non_empty_chunks(&self) -> impl Iterator<Item = &ArrayRef> + '_ {
487✔
114
        self.chunks().iter().filter(|c| !c.is_empty())
1,255✔
115
    }
487✔
116

117
    /// Returns an `ArrayIterator` over the chunks.
    ///
    /// Each chunk is cloned and wrapped in `Ok`, so the iterator built here never yields an
    /// error item of its own.
    pub fn array_iterator(&self) -> impl ArrayIterator + '_ {
901✔
118
        ArrayIteratorAdapter::new(self.dtype().clone(), self.chunks().iter().cloned().map(Ok))
901✔
119
    }
901✔
120

121
    /// Returns an `ArrayStream` over the chunks.
    ///
    /// The stream is built with `stream::iter` over the cloned chunks, each wrapped in `Ok`.
    pub fn array_stream(&self) -> impl ArrayStream + '_ {
×
122
        ArrayStreamAdapter::new(
×
123
            self.dtype().clone(),
×
124
            stream::iter(self.chunks().iter().cloned().map(Ok)),
×
125
        )
126
    }
×
127

128
    /// Greedily regroups consecutive chunks into larger chunks bounded by the given targets.
    ///
    /// Chunks are visited in order and accumulated into a pending group. When adding the next
    /// chunk would push the group past `target_bytesize` bytes or `target_rowsize` rows, the
    /// pending group is canonicalized into a single chunk and a new group is started. A chunk
    /// that exceeds either target on its own is passed through unchanged as its own chunk.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `to_canonical()` while combining a group.
    pub fn rechunk(&self, target_bytesize: u64, target_rowsize: usize) -> VortexResult<Self> {
4✔
129
        let mut new_chunks = Vec::new();
4✔
130
        // Pending group of chunks that will be merged into one output chunk.
        let mut chunks_to_combine = Vec::new();
4✔
131
        // Running byte and row totals of the pending group.
        let mut new_chunk_n_bytes = 0;
4✔
132
        let mut new_chunk_n_elements = 0;
4✔
133
        for chunk in self.chunks() {
10✔
134
            let n_bytes = chunk.nbytes();
10✔
135
            let n_elements = chunk.len();
10✔
136

137
            // Flush the pending group if this chunk would overflow either target. The non-empty
            // check avoids emitting an empty output chunk.
            if (new_chunk_n_bytes + n_bytes > target_bytesize
10✔
138
                || new_chunk_n_elements + n_elements > target_rowsize)
10✔
139
                && !chunks_to_combine.is_empty()
3✔
140
            {
141
                new_chunks.push(
3✔
142
                    // SAFETY: combining chunks of same type maintains valid chunk types
143
                    unsafe {
144
                        ChunkedArray::new_unchecked(chunks_to_combine, self.dtype().clone())
3✔
145
                            .to_canonical()?
3✔
146
                            .into_array()
3✔
147
                    },
148
                );
149

150
                new_chunk_n_bytes = 0;
3✔
151
                new_chunk_n_elements = 0;
3✔
152
                chunks_to_combine = Vec::new();
3✔
153
            }
7✔
154

155
            // A chunk that is already over a target by itself is emitted as-is; anything
            // smaller joins the pending group.
            if n_bytes > target_bytesize || n_elements > target_rowsize {
10✔
156
                new_chunks.push(chunk.clone());
1✔
157
            } else {
9✔
158
                new_chunk_n_bytes += n_bytes;
9✔
159
                new_chunk_n_elements += n_elements;
9✔
160
                chunks_to_combine.push(chunk.clone());
9✔
161
            }
9✔
162
        }
163

164
        // Flush whatever is left in the pending group after the last chunk.
        if !chunks_to_combine.is_empty() {
4✔
165
            new_chunks.push(unsafe {
4✔
166
                // SAFETY: combining chunks of same type maintains valid chunk types
167
                ChunkedArray::new_unchecked(chunks_to_combine, self.dtype().clone())
4✔
168
                    .to_canonical()?
4✔
169
                    .into_array()
4✔
170
            });
UNCOV
171
        }
×
172

173
        // SAFETY: combining chunks of same type maintains valid chunk types
174
        unsafe { Ok(Self::new_unchecked(new_chunks, self.dtype().clone())) }
4✔
175
    }
4✔
176
}
177

178
/// Builds a `ChunkedArray` from an iterator of chunks, taking the dtype from the first chunk.
///
/// # Panics
///
/// Panics with "Cannot infer DType from an empty iterator" if the iterator yields no chunks,
/// and with "Failed to create chunked array from iterator" if `try_new` rejects the chunks
/// (i.e. a later chunk's dtype differs from the first chunk's).
impl FromIterator<ArrayRef> for ChunkedArray {
179
    fn from_iter<T: IntoIterator<Item = ArrayRef>>(iter: T) -> Self {
28✔
180
        let chunks: Vec<ArrayRef> = iter.into_iter().collect();
28✔
181
        // The first chunk decides the dtype; there is nothing to infer from when it is absent.
        let dtype = chunks
28✔
182
            .first()
28✔
183
            .map(|c| c.dtype().clone())
28✔
184
            .vortex_expect("Cannot infer DType from an empty iterator");
28✔
185
        Self::try_new(chunks, dtype).vortex_expect("Failed to create chunked array from iterator")
28✔
186
    }
28✔
187
}
188

189
/// Basic array accessors for `ChunkedArray`; `len` and `dtype` read the stored fields.
impl ArrayVTable<ChunkedVTable> for ChunkedVTable {
190
    /// Returns the total row count stored at construction time (the sum of chunk lengths).
    fn len(array: &ChunkedArray) -> usize {
200,876✔
191
        array.len
200,876✔
192
    }
200,876✔
193

194
    /// Returns the dtype of the chunked array.
    fn dtype(array: &ChunkedArray) -> &DType {
100,965✔
195
        &array.dtype
100,965✔
196
    }
100,965✔
197

198
    /// Returns a reference view over the array's statistics set, bound to this array.
    fn stats(array: &ChunkedArray) -> StatsSetRef<'_> {
24,957✔
199
        array.stats_set.to_ref(array.as_ref())
24,957✔
200
    }
24,957✔
201
}
202

203
/// Validity queries for `ChunkedArray`, answered by delegating to the chunks.
impl ValidityVTable<ChunkedVTable> for ChunkedVTable {
204
    /// Returns whether the row at `index` is valid.
    ///
    /// Non-nullable arrays return `true` without touching any chunk; otherwise the query is
    /// forwarded to the chunk containing `index`.
    fn is_valid(array: &ChunkedArray, index: usize) -> VortexResult<bool> {
49,221✔
205
        if !array.dtype.is_nullable() {
49,221✔
206
            return Ok(true);
41,083✔
207
        }
8,138✔
208
        let (chunk, offset_in_chunk) = array.find_chunk_idx(index);
8,138✔
209
        array.chunk(chunk).is_valid(offset_in_chunk)
8,138✔
210
    }
49,221✔
211

212
    /// Returns `true` if the dtype is non-nullable or every non-empty chunk is all-valid.
    ///
    /// Empty chunks are skipped, so an array with no non-empty chunks reports `true`.
    fn all_valid(array: &ChunkedArray) -> VortexResult<bool> {
25✔
213
        if !array.dtype().is_nullable() {
25✔
214
            return Ok(true);
16✔
215
        }
9✔
216
        for chunk in array.non_empty_chunks() {
13✔
217
            if !chunk.all_valid()? {
13✔
218
                return Ok(false);
7✔
219
            }
6✔
220
        }
221
        Ok(true)
2✔
222
    }
25✔
223

224
    /// Returns `true` if the dtype is nullable and every non-empty chunk is all-invalid.
    ///
    /// Non-nullable arrays always report `false`. Empty chunks are skipped, so a nullable array
    /// with no non-empty chunks reports `true`.
    fn all_invalid(array: &ChunkedArray) -> VortexResult<bool> {
460✔
225
        if !array.dtype().is_nullable() {
460✔
226
            return Ok(false);
447✔
227
        }
13✔
228
        for chunk in array.non_empty_chunks() {
15✔
229
            if !chunk.all_invalid()? {
15✔
230
                return Ok(false);
11✔
231
            }
4✔
232
        }
233
        Ok(true)
2✔
234
    }
460✔
235

236
    /// Collects the validity masks of all chunks, in order, into a single `Mask`.
    ///
    /// The first error returned by a chunk's `validity_mask()` aborts the collection.
    fn validity_mask(array: &ChunkedArray) -> VortexResult<Mask> {
907✔
237
        array
907✔
238
            .chunks()
907✔
239
            .iter()
907✔
240
            .map(|a| a.validity_mask())
2,206✔
241
            .try_collect()
907✔
242
    }
907✔
243
}
244

245
#[cfg(test)]
246
mod test {
247
    use vortex_buffer::buffer;
248
    use vortex_dtype::{DType, Nullability, PType};
249
    use vortex_error::VortexResult;
250

251
    use crate::array::Array;
252
    use crate::arrays::chunked::ChunkedArray;
253
    use crate::arrays::{ChunkedVTable, PrimitiveArray};
254
    use crate::compute::sub_scalar;
255
    use crate::validity::Validity;
256
    use crate::{IntoArray, ToCanonical, assert_arrays_eq};
257

258
    // Test fixture: three non-nullable u64 chunks of three rows each, holding 1..=9 in order.
    fn chunked_array() -> ChunkedArray {
1✔
259
        ChunkedArray::try_new(
1✔
260
            vec![
1✔
261
                buffer![1u64, 2, 3].into_array(),
1✔
262
                buffer![4u64, 5, 6].into_array(),
1✔
263
                buffer![7u64, 8, 9].into_array(),
1✔
264
            ],
265
            DType::Primitive(PType::U64, Nullability::NonNullable),
1✔
266
        )
267
        .unwrap()
1✔
268
    }
1✔
269

270
    // Subtracting the scalar 1 from the fixture must yield a chunked result whose three chunks
    // each hold the corresponding input chunk minus 1.
    #[test]
271
    fn test_scalar_subtract() {
1✔
272
        let chunked = chunked_array().into_array();
1✔
273
        let to_subtract = 1u64;
1✔
274
        let array = sub_scalar(&chunked, to_subtract.into()).unwrap();
1✔
275

276
        // View the result as a chunked array again so the chunks can be inspected one by one.
        let chunked = array.as_::<ChunkedVTable>();
1✔
277
        let chunks_out = chunked.chunks();
1✔
278

279
        let results = chunks_out[0]
1✔
280
            .to_primitive()
1✔
281
            .unwrap()
1✔
282
            .as_slice::<u64>()
1✔
283
            .to_vec();
1✔
284
        assert_eq!(results, &[0u64, 1, 2]);
1✔
285
        let results = chunks_out[1]
1✔
286
            .to_primitive()
1✔
287
            .unwrap()
1✔
288
            .as_slice::<u64>()
1✔
289
            .to_vec();
1✔
290
        assert_eq!(results, &[3u64, 4, 5]);
1✔
291
        let results = chunks_out[2]
1✔
292
            .to_primitive()
1✔
293
            .unwrap()
1✔
294
            .as_slice::<u64>()
1✔
295
            .to_vec();
1✔
296
        assert_eq!(results, &[6u64, 7, 8]);
1✔
297
    }
1✔
298

299
    // Rechunking a single one-row chunk with generous targets must preserve the array contents.
    #[test]
300
    fn test_rechunk_one_chunk() {
1✔
301
        let chunked = ChunkedArray::try_new(
1✔
302
            vec![buffer![0u64].into_array()],
1✔
303
            DType::Primitive(PType::U64, Nullability::NonNullable),
1✔
304
        )
305
        .unwrap();
1✔
306

307
        let rechunked = chunked.rechunk(1 << 16, 1 << 16).unwrap();
1✔
308

309
        assert_arrays_eq!(chunked, rechunked);
1✔
310
    }
1✔
311

312
    // Two one-row chunks fit comfortably under both targets, so they must be merged into a
    // single chunk without changing the contents.
    #[test]
313
    fn test_rechunk_two_chunks() {
1✔
314
        let chunked = ChunkedArray::try_new(
1✔
315
            vec![buffer![0u64].into_array(), buffer![5u64].into_array()],
1✔
316
            DType::Primitive(PType::U64, Nullability::NonNullable),
1✔
317
        )
318
        .unwrap();
1✔
319

320
        let rechunked = chunked.rechunk(1 << 16, 1 << 16).unwrap();
1✔
321

322
        assert_eq!(rechunked.nchunks(), 1);
1✔
323
        assert_arrays_eq!(chunked, rechunked);
1✔
324
    }
1✔
325

326
    // With a row target of 5, chunks of 4 and 2 rows cannot be merged (4 + 2 > 5), so both
    // must survive as separate chunks.
    #[test]
327
    fn test_rechunk_tiny_target_chunks() {
1✔
328
        let chunked = ChunkedArray::try_new(
1✔
329
            vec![
1✔
330
                buffer![0u64, 1, 2, 3].into_array(),
1✔
331
                buffer![4u64, 5].into_array(),
1✔
332
            ],
333
            DType::Primitive(PType::U64, Nullability::NonNullable),
1✔
334
        )
335
        .unwrap();
1✔
336

337
        let rechunked = chunked.rechunk(1 << 16, 5).unwrap();
1✔
338

339
        assert_eq!(rechunked.nchunks(), 2);
1✔
340
        // The output chunks here have 4 and 2 rows, both below the row target of 5.
        assert!(rechunked.chunks().iter().all(|c| c.len() < 5));
2✔
341
        assert_arrays_eq!(chunked, rechunked);
1✔
342
    }
1✔
343

344
    // A chunk that exceeds the row target on its own (6 rows > 5) must be passed through
    // unchanged, while its smaller neighbours are still grouped greedily around it.
    #[test]
345
    fn test_rechunk_with_too_big_chunk() {
1✔
346
        let chunked = ChunkedArray::try_new(
1✔
347
            vec![
1✔
348
                buffer![0u64, 1, 2].into_array(),
1✔
349
                buffer![42_u64; 6].into_array(),
1✔
350
                buffer![4u64, 5].into_array(),
1✔
351
                buffer![6u64, 7].into_array(),
1✔
352
                buffer![8u64, 9].into_array(),
1✔
353
            ],
354
            DType::Primitive(PType::U64, Nullability::NonNullable),
1✔
355
        )
356
        .unwrap();
1✔
357

358
        let rechunked = chunked.rechunk(1 << 16, 5).unwrap();
1✔
359
        // greedy so should be: [0, 1, 2] [42, 42, 42, 42, 42, 42] [4, 5, 6, 7] [8, 9]
360

361
        assert_eq!(rechunked.nchunks(), 4);
1✔
362
        assert_arrays_eq!(chunked, rechunked);
1✔
363
    }
1✔
364

365
    // Empty chunks must not affect the all-valid / all-invalid answers when every non-empty
    // chunk is all-valid.
    #[test]
366
    fn test_empty_chunks_all_valid() -> VortexResult<()> {
1✔
367
        // Create chunks where some are empty but all non-empty chunks have all valid values
368
        let chunks = vec![
1✔
369
            PrimitiveArray::new(buffer![1u64, 2, 3], Validity::AllValid).into_array(),
1✔
370
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllValid).into_array(), // empty chunk
1✔
371
            PrimitiveArray::new(buffer![4u64, 5], Validity::AllValid).into_array(),
1✔
372
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllValid).into_array(), // empty chunk
1✔
373
        ];
374

375
        // The dtype must be nullable, otherwise the validity checks short-circuit.
        let chunked =
1✔
376
            ChunkedArray::try_new(chunks, DType::Primitive(PType::U64, Nullability::Nullable))?;
1✔
377

378
        // Should be all_valid since all non-empty chunks are all_valid
379
        assert!(chunked.all_valid()?);
1✔
380
        assert!(!chunked.all_invalid()?);
1✔
381

382
        Ok(())
1✔
383
    }
1✔
384

385
    // Empty chunks must not affect the all-valid / all-invalid answers when every non-empty
    // chunk is all-invalid.
    #[test]
386
    fn test_empty_chunks_all_invalid() -> VortexResult<()> {
1✔
387
        // Create chunks where some are empty but all non-empty chunks have all invalid values
388
        let chunks = vec![
1✔
389
            PrimitiveArray::new(buffer![1u64, 2], Validity::AllInvalid).into_array(),
1✔
390
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllInvalid).into_array(), // empty chunk
1✔
391
            PrimitiveArray::new(buffer![3u64, 4, 5], Validity::AllInvalid).into_array(),
1✔
392
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllInvalid).into_array(), // empty chunk
1✔
393
        ];
394

395
        // The dtype must be nullable, otherwise the validity checks short-circuit.
        let chunked =
1✔
396
            ChunkedArray::try_new(chunks, DType::Primitive(PType::U64, Nullability::Nullable))?;
1✔
397

398
        // Should be all_invalid since all non-empty chunks are all_invalid
399
        assert!(!chunked.all_valid()?);
1✔
400
        assert!(chunked.all_invalid()?);
1✔
401

402
        Ok(())
1✔
403
    }
1✔
404

405
    // With one all-valid and one all-invalid non-empty chunk, the array as a whole is neither
    // all-valid nor all-invalid, regardless of the empty chunks in between.
    #[test]
406
    fn test_empty_chunks_mixed_validity() -> VortexResult<()> {
1✔
407
        // Create chunks with mixed validity including empty chunks
408
        let chunks = vec![
1✔
409
            PrimitiveArray::new(buffer![1u64, 2], Validity::AllValid).into_array(),
1✔
410
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllValid).into_array(), // empty chunk
1✔
411
            PrimitiveArray::new(buffer![3u64, 4], Validity::AllInvalid).into_array(),
1✔
412
            PrimitiveArray::new(buffer![0u64; 0], Validity::AllInvalid).into_array(), // empty chunk
1✔
413
        ];
414

415
        // The dtype must be nullable, otherwise the validity checks short-circuit.
        let chunked =
1✔
416
            ChunkedArray::try_new(chunks, DType::Primitive(PType::U64, Nullability::Nullable))?;
1✔
417

418
        // Should be neither all_valid nor all_invalid
419
        assert!(!chunked.all_valid()?);
1✔
420
        assert!(!chunked.all_invalid()?);
1✔
421

422
        Ok(())
1✔
423
    }
1✔
424
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc