• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16387130526

19 Jul 2025 09:05AM UTC coverage: 81.512% (-0.008%) from 81.52%
16387130526

push

github

web-flow
feat: duckdb workstealing (#3927)

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>

16 of 17 new or added lines in 1 file covered. (94.12%)

185 existing lines in 8 files now uncovered.

42000 of 51526 relevant lines covered (81.51%)

171508.11 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

86.76
/vortex-array/src/canonical.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
//! Encodings that enable zero-copy sharing of data with Arrow.
5

6
use vortex_dtype::DType;
7
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
8

9
use crate::arrays::{
10
    BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray,
11
    VarBinViewArray, compact_buffers,
12
};
13
use crate::builders::builder_with_capacity;
14
use crate::{Array, ArrayRef, IntoArray};
15

16
/// An enum capturing the default uncompressed encodings for each [Vortex type][DType].
17
///
18
/// Any array can be decoded into canonical form via the [`to_canonical`][Array::to_canonical]
19
/// trait method. This is the simplest encoding for a type, and will not be compressed but may
20
/// contain compressed child arrays.
21
///
22
/// Canonical form is useful for doing type-specific compute where you need to know that all
23
/// elements are laid out decompressed and contiguous in memory.
24
///
25
/// # Laziness
26
///
27
/// Canonical form is not recursive, so while a `StructArray` is the canonical format for any
28
/// `Struct` type, individual column child arrays may still be compressed. This allows
29
/// compute over Vortex arrays to push decoding as late as possible, and ideally many child arrays
30
/// never need to be decoded into canonical form at all depending on the compute.
31
///
32
/// # Arrow interoperability
33
///
34
/// All of the Vortex canonical encodings have an equivalent Arrow encoding that can be built
35
/// zero-copy, and the corresponding Arrow array types can also be built directly.
36
///
37
/// The full list of canonical types and their equivalent Arrow array types are:
38
///
39
/// * `NullArray`: [`arrow_array::NullArray`]
40
/// * `BoolArray`: [`arrow_array::BooleanArray`]
41
/// * `PrimitiveArray`: [`arrow_array::PrimitiveArray`]
42
/// * `DecimalArray`: [`arrow_array::Decimal128Array`] and [`arrow_array::Decimal256Array`]
43
/// * `StructArray`: [`arrow_array::StructArray`]
44
/// * `ListArray`: [`arrow_array::ListArray`]
45
/// * `VarBinViewArray`: [`arrow_array::GenericByteViewArray`]
46
///
47
/// Vortex uses a logical type system, unlike Arrow which uses physical encodings for its types.
48
/// As an example, there are at least six valid physical encodings for a `Utf8` array. This can
49
/// create ambiguity.
50
/// Thus, if you receive an Arrow array, compress it using Vortex, and then
51
/// decompress it later to pass to a compute kernel, there are multiple suitable Arrow array
52
/// variants to hold the data.
53
///
54
/// To disambiguate, we choose a canonical physical encoding for every Vortex [`DType`], which
55
/// will correspond to an arrow-rs [`arrow_schema::DataType`].
56
///
57
/// # Views support
58
///
59
/// Binary and String views, also known as "German strings", are a better encoding format for
60
/// nearly all use-cases. Variable-length binary views are part of the Apache Arrow spec, and are
61
/// fully supported by the Datafusion query engine. We use them as our canonical string encoding
62
/// for all `Utf8` and `Binary` typed arrays in Vortex. They provide considerably faster filter
63
/// execution than the core `StringArray` and `BinaryArray` types, at the expense of potentially
64
/// needing [garbage collection][arrow_array::GenericByteViewArray::gc] to clear unreferenced items
65
/// from memory.
66
#[derive(Debug, Clone)]
pub enum Canonical {
    /// Canonical encoding for [`Null`][DType::Null] typed arrays.
    Null(NullArray),
    /// Canonical encoding for [`Bool`][DType::Bool] typed arrays.
    Bool(BoolArray),
    /// Canonical encoding for [`Primitive`][DType::Primitive] typed arrays.
    Primitive(PrimitiveArray),
    /// Canonical encoding for [`Decimal`][DType::Decimal] typed arrays.
    Decimal(DecimalArray),
    /// Canonical encoding for [`Struct`][DType::Struct] typed arrays.
    Struct(StructArray),
    // TODO(joe): maybe this should be a ListView, however this will be annoying in spiral
    /// Canonical encoding for [`List`][DType::List] typed arrays.
    List(ListArray),
    /// Canonical encoding for [`Utf8`][DType::Utf8] and [`Binary`][DType::Binary] typed arrays.
    VarBinView(VarBinViewArray),
    /// Canonical encoding for [`Extension`][DType::Extension] typed arrays.
    Extension(ExtensionArray),
}
78

79
impl Canonical {
80
    /// Create an empty canonical array of the given dtype.
81
    pub fn empty(dtype: &DType) -> Canonical {
564✔
82
        builder_with_capacity(dtype, 0)
564✔
83
            .finish()
564✔
84
            .to_canonical()
564✔
85
            .vortex_expect("cannot fail to convert an empty array to canonical")
564✔
86
    }
564✔
87
}
88

89
impl Canonical {
90
    /// Performs a (potentially expensive) compaction operation on the array before it is complete.
91
    ///
92
    /// This is mostly relevant for the variable-length types such as Utf8, Binary or List where
93
    /// they can accumulate wasted space after slicing and taking operations.
94
    ///
95
    /// This operation is very expensive and can result in things like allocations, full-scans
96
    /// and copy operations.
97
    pub fn compact(&self) -> VortexResult<Canonical> {
20,970✔
98
        match self {
20,970✔
99
            Canonical::VarBinView(array) => Ok(Canonical::VarBinView(compact_buffers(array)?)),
3,504✔
100
            other => Ok(other.clone()),
17,466✔
101
        }
102
    }
20,970✔
103
}
104

105
// Unwrap canonical type back down to specialized type.
106
impl Canonical {
107
    pub fn into_null(self) -> VortexResult<NullArray> {
3✔
108
        match self {
3✔
109
            Canonical::Null(a) => Ok(a),
3✔
UNCOV
110
            _ => vortex_bail!("Cannot unwrap NullArray from {:?}", &self),
×
111
        }
112
    }
3✔
113

114
    pub fn into_bool(self) -> VortexResult<BoolArray> {
44,774✔
115
        match self {
44,774✔
116
            Canonical::Bool(a) => Ok(a),
44,774✔
UNCOV
117
            _ => vortex_bail!("Cannot unwrap BoolArray from {:?}", &self),
×
118
        }
119
    }
44,774✔
120

121
    pub fn into_primitive(self) -> VortexResult<PrimitiveArray> {
125,806✔
122
        match self {
125,806✔
123
            Canonical::Primitive(a) => Ok(a),
125,806✔
UNCOV
124
            _ => vortex_bail!("Cannot unwrap PrimitiveArray from {:?}", &self),
×
125
        }
126
    }
125,806✔
127

128
    pub fn into_decimal(self) -> VortexResult<DecimalArray> {
735✔
129
        match self {
735✔
130
            Canonical::Decimal(a) => Ok(a),
735✔
UNCOV
131
            _ => vortex_bail!("Cannot unwrap DecimalArray from {:?}", &self),
×
132
        }
133
    }
735✔
134

135
    pub fn into_struct(self) -> VortexResult<StructArray> {
23,185✔
136
        match self {
23,185✔
137
            Canonical::Struct(a) => Ok(a),
23,185✔
UNCOV
138
            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
×
139
        }
140
    }
23,185✔
141

142
    pub fn into_list(self) -> VortexResult<ListArray> {
312✔
143
        match self {
312✔
144
            Canonical::List(a) => Ok(a),
312✔
145
            _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self),
×
146
        }
147
    }
312✔
148

149
    pub fn into_varbinview(self) -> VortexResult<VarBinViewArray> {
14,412✔
150
        match self {
14,412✔
151
            Canonical::VarBinView(a) => Ok(a),
14,412✔
UNCOV
152
            _ => vortex_bail!("Cannot unwrap VarBinViewArray from {:?}", &self),
×
153
        }
154
    }
14,412✔
155

UNCOV
156
    pub fn into_extension(self) -> VortexResult<ExtensionArray> {
×
UNCOV
157
        match self {
×
UNCOV
158
            Canonical::Extension(a) => Ok(a),
×
UNCOV
159
            _ => vortex_bail!("Cannot unwrap ExtensionArray from {:?}", &self),
×
160
        }
UNCOV
161
    }
×
162
}
163

164
impl AsRef<dyn Array> for Canonical {
165
    fn as_ref(&self) -> &(dyn Array + 'static) {
940,854✔
166
        match &self {
940,854✔
167
            Canonical::Null(a) => a.as_ref(),
9✔
168
            Canonical::Bool(a) => a.as_ref(),
158,083✔
169
            Canonical::Primitive(a) => a.as_ref(),
562,724✔
170
            Canonical::Decimal(a) => a.as_ref(),
26,209✔
171
            Canonical::Struct(a) => a.as_ref(),
91,935✔
172
            Canonical::List(a) => a.as_ref(),
1,867✔
173
            Canonical::VarBinView(a) => a.as_ref(),
92,641✔
174
            Canonical::Extension(a) => a.as_ref(),
7,386✔
175
        }
176
    }
940,854✔
177
}
178

179
impl IntoArray for Canonical {
180
    fn into_array(self) -> ArrayRef {
17,640✔
181
        match self {
17,640✔
UNCOV
182
            Canonical::Null(a) => a.into_array(),
×
183
            Canonical::Bool(a) => a.into_array(),
371✔
184
            Canonical::Primitive(a) => a.into_array(),
9,215✔
185
            Canonical::Decimal(a) => a.into_array(),
2,186✔
186
            Canonical::Struct(a) => a.into_array(),
186✔
187
            Canonical::List(a) => a.into_array(),
149✔
188
            Canonical::VarBinView(a) => a.into_array(),
4,535✔
189
            Canonical::Extension(a) => a.into_array(),
998✔
190
        }
191
    }
17,640✔
192
}
193

194
/// Trait for canonicalizing a borrowed array directly into a specific canonical array type.
///
/// # Canonicalization
///
/// This trait has a blanket implementation for all types implementing [`Array`].
pub trait ToCanonical {
    /// Canonicalize into a [`NullArray`] if the target is [`Null`][DType::Null] typed.
    fn to_null(&self) -> VortexResult<NullArray>;

    /// Canonicalize into a [`BoolArray`] if the target is [`Bool`][DType::Bool] typed.
    fn to_bool(&self) -> VortexResult<BoolArray>;

    /// Canonicalize into a [`PrimitiveArray`] if the target is [`Primitive`][DType::Primitive]
    /// typed.
    fn to_primitive(&self) -> VortexResult<PrimitiveArray>;

    /// Canonicalize into a [`DecimalArray`] if the target is [`Decimal`][DType::Decimal]
    /// typed.
    fn to_decimal(&self) -> VortexResult<DecimalArray>;

    /// Canonicalize into a [`StructArray`] if the target is [`Struct`][DType::Struct] typed.
    fn to_struct(&self) -> VortexResult<StructArray>;

    /// Canonicalize into a [`ListArray`] if the target is [`List`][DType::List] typed.
    fn to_list(&self) -> VortexResult<ListArray>;

    /// Canonicalize into a [`VarBinViewArray`] if the target is [`Utf8`][DType::Utf8]
    /// or [`Binary`][DType::Binary] typed.
    fn to_varbinview(&self) -> VortexResult<VarBinViewArray>;

    /// Canonicalize into an [`ExtensionArray`] if the array is [`Extension`][DType::Extension]
    /// typed.
    fn to_extension(&self) -> VortexResult<ExtensionArray>;
}
228

229
// Blanket impl for all Array encodings.
230
impl<A: Array + ?Sized> ToCanonical for A {
231
    fn to_null(&self) -> VortexResult<NullArray> {
3✔
232
        self.to_canonical()?.into_null()
3✔
233
    }
3✔
234

235
    fn to_bool(&self) -> VortexResult<BoolArray> {
43,994✔
236
        self.to_canonical()?.into_bool()
43,994✔
237
    }
43,994✔
238

239
    fn to_primitive(&self) -> VortexResult<PrimitiveArray> {
123,235✔
240
        self.to_canonical()?.into_primitive()
123,235✔
241
    }
123,235✔
242

243
    fn to_decimal(&self) -> VortexResult<DecimalArray> {
735✔
244
        self.to_canonical()?.into_decimal()
735✔
245
    }
735✔
246

247
    fn to_struct(&self) -> VortexResult<StructArray> {
23,074✔
248
        self.to_canonical()?.into_struct()
23,074✔
249
    }
23,074✔
250

251
    fn to_list(&self) -> VortexResult<ListArray> {
304✔
252
        self.to_canonical()?.into_list()
304✔
253
    }
304✔
254

255
    fn to_varbinview(&self) -> VortexResult<VarBinViewArray> {
14,411✔
256
        self.to_canonical()?.into_varbinview()
14,411✔
257
    }
14,411✔
258

UNCOV
259
    fn to_extension(&self) -> VortexResult<ExtensionArray> {
×
260
        self.to_canonical()?.into_extension()
×
UNCOV
261
    }
×
262
}
263

264
impl From<Canonical> for ArrayRef {
    /// Converts a [`Canonical`] into an [`ArrayRef`].
    ///
    /// Delegates to the [`IntoArray`] implementation for [`Canonical`], which performs the
    /// same per-variant `into_array()` conversion.
    fn from(value: Canonical) -> Self {
        value.into_array()
    }
}
278

279
#[cfg(test)]
280
mod test {
281
    use std::sync::Arc;
282

283
    use arrow_array::cast::AsArray;
284
    use arrow_array::types::{Int32Type, Int64Type, UInt64Type};
285
    use arrow_array::{
286
        Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray,
287
        PrimitiveArray as ArrowPrimitiveArray, StringArray, StringViewArray,
288
        StructArray as ArrowStructArray,
289
    };
290
    use arrow_buffer::{NullBufferBuilder, OffsetBuffer};
291
    use arrow_schema::{DataType, Field};
292
    use vortex_buffer::buffer;
293

294
    use crate::arrays::{ConstantArray, StructArray};
295
    use crate::arrow::{FromArrowArray, IntoArrowArray};
296
    use crate::{ArrayRef, IntoArray};
297

298
    #[test]
    fn test_canonicalize_nested_struct() {
        // The nested struct contains a ConstantArray representing the primitive array
        //   [100i64]
        // ConstantArray is not a canonical type, so converting `into_arrow()` should
        // map this to the nearest canonical type (PrimitiveArray).
        let inner = StructArray::from_fields(&[(
            "inner_a",
            ConstantArray::new(100i64, 1).into_array(),
        )])
        .unwrap();

        // Outer struct with multiple internal components: a plain buffer and the inner struct.
        let nested_struct_array = StructArray::from_fields(&[
            ("a", buffer![1u64].into_array()),
            ("b", inner.into_array()),
        ])
        .unwrap();

        let arrow_array = nested_struct_array
            .into_array()
            .into_arrow_preferred()
            .unwrap();
        let arrow_struct = arrow_array
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // Column "a" must come back as an Arrow u64 primitive array.
        let column_a = arrow_struct.column(0);
        assert!(
            column_a
                .as_any()
                .downcast_ref::<ArrowPrimitiveArray<UInt64Type>>()
                .is_some()
        );

        // Column "b" must come back as an Arrow struct.
        let column_b = arrow_struct.column(1).clone();
        let inner_struct = column_b
            .as_any()
            .downcast_ref::<ArrowStructArray>()
            .cloned()
            .unwrap();

        // The constant child must have been canonicalized into an Arrow i64 primitive array.
        let inner_a = inner_struct
            .column(0)
            .as_any()
            .downcast_ref::<ArrowPrimitiveArray<Int64Type>>();
        assert!(inner_a.is_some());

        let expected = ArrowPrimitiveArray::<Int64Type>::from_iter([100i64]);
        assert_eq!(inner_a.cloned().unwrap(), expected);
    }
1✔
355

356
    #[test]
    fn roundtrip_struct() {
        // Two nullable child columns of six rows each.
        let names = Arc::new(StringViewArray::from_iter(vec![
            Some("Joseph"),
            None,
            Some("Angela"),
            Some("Mikhail"),
            None,
            None,
        ]));
        let ages = Arc::new(ArrowPrimitiveArray::<Int32Type>::from(vec![
            Some(25),
            Some(31),
            None,
            Some(57),
            None,
            None,
        ]));

        // Struct-level validity: four valid rows, one null row, one valid row.
        let mut nulls = NullBufferBuilder::new(6);
        nulls.append_n_non_nulls(4);
        nulls.append_null();
        nulls.append_non_null();
        let validity = nulls.finish();

        let arrow_struct = ArrowStructArray::new(
            vec![
                Arc::new(Field::new("name", DataType::Utf8View, true)),
                Arc::new(Field::new("age", DataType::Int32, true)),
            ]
            .into(),
            vec![names, ages],
            validity,
        );

        // Arrow -> Vortex -> Arrow must reproduce the original struct array.
        let vortex_struct = ArrayRef::from_arrow(&arrow_struct, true);
        let roundtripped = vortex_struct.into_arrow_preferred().unwrap();

        assert_eq!(&arrow_struct, roundtripped.as_struct());
    }
1✔
396

397
    #[test]
    fn roundtrip_list() {
        // Three strings split across three lists of lengths 0, 2 and 1.
        let names = Arc::new(StringArray::from_iter(vec![
            Some("Joseph"),
            Some("Angela"),
            Some("Mikhail"),
        ]));
        let item_field = Arc::new(Field::new_list_field(DataType::Utf8, true));
        let offsets = OffsetBuffer::from_lengths(vec![0, 2, 1]);

        let arrow_list = ArrowListArray::new(item_field, offsets, names, None);
        let list_data_type = arrow_list.data_type();

        // Arrow -> Vortex -> Arrow, requesting the original Arrow data type on the way back.
        let vortex_list = ArrayRef::from_arrow(&arrow_list, true);
        let rt_arrow_list = vortex_list.into_arrow(list_data_type).unwrap();

        let expected: ArrowArrayRef = Arc::new(arrow_list.clone());
        assert_eq!(expected.as_ref(), rt_arrow_list.as_ref());
    }
1✔
422
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc