• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16935267080

13 Aug 2025 11:00AM UTC coverage: 24.312% (-63.3%) from 87.658%
16935267080

Pull #4226

github

web-flow
Merge 81b48c7fb into baa6ea202
Pull Request #4226: Support converting TimestampTZ to and from duckdb

0 of 2 new or added lines in 1 file covered. (0.0%)

20666 existing lines in 469 files now uncovered.

8726 of 35892 relevant lines covered (24.31%)

147.74 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/vortex-btrblocks/src/string.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
use vortex_array::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
5
use vortex_array::vtable::ValidityHelper;
6
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
7
use vortex_dict::DictArray;
8
use vortex_dict::builders::dict_encode;
9
use vortex_error::{VortexExpect, VortexResult};
10
use vortex_fsst::{FSSTArray, fsst_compress, fsst_train_compressor};
11
use vortex_utils::aliases::hash_set::HashSet;
12

13
use crate::integer::IntCompressor;
14
use crate::sample::sample;
15
use crate::{
16
    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
17
    estimate_compression_ratio_with_sampling, integer,
18
};
19

20
#[derive(Clone, Debug)]
21
pub struct StringStats {
22
    src: VarBinViewArray,
23
    estimated_distinct_count: u32,
24
    value_count: u32,
25
    // null_count: u32,
26
}
27

28
/// Estimate the number of distinct strings in the var bin view array.
29
#[allow(clippy::cast_possible_truncation)]
UNCOV
30
fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
×
UNCOV
31
    let views = strings.views();
×
32
    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
33
    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
34
    // share a 4-byte prefix and have the same length.
UNCOV
35
    let mut distinct = HashSet::with_capacity(views.len() / 2);
×
UNCOV
36
    views.iter().for_each(|&view| {
×
UNCOV
37
        let len_and_prefix = view.as_u128() as u64;
×
UNCOV
38
        distinct.insert(len_and_prefix);
×
UNCOV
39
    });
×
40

UNCOV
41
    distinct
×
UNCOV
42
        .len()
×
UNCOV
43
        .try_into()
×
UNCOV
44
        .vortex_expect("distinct count must fit in u32")
×
UNCOV
45
}
×
46

47
impl CompressorStats for StringStats {
48
    type ArrayVTable = VarBinViewVTable;
49

UNCOV
50
    fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
×
UNCOV
51
        let null_count = input
×
UNCOV
52
            .statistics()
×
UNCOV
53
            .compute_null_count()
×
UNCOV
54
            .vortex_expect("null count");
×
UNCOV
55
        let value_count = input.len() - null_count;
×
UNCOV
56
        let estimated_distinct = if opts.count_distinct_values {
×
UNCOV
57
            estimate_distinct_count(input)
×
58
        } else {
UNCOV
59
            u32::MAX
×
60
        };
61

UNCOV
62
        Self {
×
UNCOV
63
            src: input.clone(),
×
UNCOV
64
            value_count: value_count.try_into().vortex_expect("value_count"),
×
UNCOV
65
            estimated_distinct_count: estimated_distinct,
×
UNCOV
66
        }
×
UNCOV
67
    }
×
68

UNCOV
69
    fn source(&self) -> &VarBinViewArray {
×
UNCOV
70
        &self.src
×
UNCOV
71
    }
×
72

UNCOV
73
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
×
UNCOV
74
        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
×
UNCOV
75
            .to_varbinview()
×
UNCOV
76
            .vortex_expect("varbinview");
×
77

UNCOV
78
        Self::generate_opts(&sampled, opts)
×
UNCOV
79
    }
×
80
}
81

82
pub struct StringCompressor;
83

84
impl Compressor for StringCompressor {
85
    type ArrayVTable = VarBinViewVTable;
86
    type SchemeType = dyn StringScheme;
87
    type StatsType = StringStats;
88

UNCOV
89
    fn schemes() -> &'static [&'static Self::SchemeType] {
×
UNCOV
90
        &[&UncompressedScheme, &DictScheme, &FSSTScheme]
×
UNCOV
91
    }
×
92

UNCOV
93
    fn default_scheme() -> &'static Self::SchemeType {
×
UNCOV
94
        &UncompressedScheme
×
UNCOV
95
    }
×
96

UNCOV
97
    fn dict_scheme_code() -> StringCode {
×
UNCOV
98
        DICT_SCHEME
×
UNCOV
99
    }
×
100
}
101

102
pub trait StringScheme: Scheme<StatsType = StringStats, CodeType = StringCode> {}
103

104
impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = StringCode> {}
105

106
/// Scheme that returns the source array unchanged.
#[derive(Clone, Copy, Debug)]
pub struct UncompressedScheme;

/// Scheme that dictionary-encodes the strings.
#[derive(Clone, Copy, Debug)]
pub struct DictScheme;

/// Scheme that compresses the strings with FSST.
#[derive(Clone, Copy, Debug)]
pub struct FSSTScheme;
114

115
/// Identifier of a string compression scheme.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct StringCode(u8);

/// Code of [`UncompressedScheme`].
const UNCOMPRESSED_SCHEME: StringCode = StringCode(0);
/// Code of [`DictScheme`].
const DICT_SCHEME: StringCode = StringCode(1);
/// Code of [`FSSTScheme`].
const FSST_SCHEME: StringCode = StringCode(2);
121

122
impl Scheme for UncompressedScheme {
123
    type StatsType = StringStats;
124
    type CodeType = StringCode;
125

UNCOV
126
    fn code(&self) -> StringCode {
×
UNCOV
127
        UNCOMPRESSED_SCHEME
×
UNCOV
128
    }
×
129

UNCOV
130
    fn expected_compression_ratio(
×
UNCOV
131
        &self,
×
UNCOV
132
        _stats: &Self::StatsType,
×
UNCOV
133
        _is_sample: bool,
×
UNCOV
134
        _allowed_cascading: usize,
×
UNCOV
135
        _excludes: &[StringCode],
×
UNCOV
136
    ) -> VortexResult<f64> {
×
UNCOV
137
        Ok(1.0)
×
UNCOV
138
    }
×
139

UNCOV
140
    fn compress(
×
UNCOV
141
        &self,
×
UNCOV
142
        stats: &Self::StatsType,
×
UNCOV
143
        _is_sample: bool,
×
UNCOV
144
        _allowed_cascading: usize,
×
UNCOV
145
        _excludes: &[StringCode],
×
UNCOV
146
    ) -> VortexResult<ArrayRef> {
×
UNCOV
147
        Ok(stats.source().to_array())
×
UNCOV
148
    }
×
149
}
150

151
impl Scheme for DictScheme {
152
    type StatsType = StringStats;
153
    type CodeType = StringCode;
154

UNCOV
155
    fn code(&self) -> StringCode {
×
UNCOV
156
        DICT_SCHEME
×
UNCOV
157
    }
×
158

UNCOV
159
    fn expected_compression_ratio(
×
UNCOV
160
        &self,
×
UNCOV
161
        stats: &Self::StatsType,
×
UNCOV
162
        is_sample: bool,
×
UNCOV
163
        allowed_cascading: usize,
×
UNCOV
164
        excludes: &[StringCode],
×
UNCOV
165
    ) -> VortexResult<f64> {
×
166
        // If we don't have a sufficiently high number of distinct values, do not attempt Dict.
UNCOV
167
        if stats.estimated_distinct_count > stats.value_count / 2 {
×
UNCOV
168
            return Ok(0.0);
×
UNCOV
169
        }
×
170

171
        // If array is all null, do not attempt dict.
UNCOV
172
        if stats.value_count == 0 {
×
173
            return Ok(0.0);
×
UNCOV
174
        }
×
175

UNCOV
176
        estimate_compression_ratio_with_sampling(
×
UNCOV
177
            self,
×
UNCOV
178
            stats,
×
UNCOV
179
            is_sample,
×
UNCOV
180
            allowed_cascading,
×
UNCOV
181
            excludes,
×
182
        )
UNCOV
183
    }
×
184

UNCOV
185
    fn compress(
×
UNCOV
186
        &self,
×
UNCOV
187
        stats: &Self::StatsType,
×
UNCOV
188
        is_sample: bool,
×
UNCOV
189
        allowed_cascading: usize,
×
UNCOV
190
        _excludes: &[StringCode],
×
UNCOV
191
    ) -> VortexResult<ArrayRef> {
×
UNCOV
192
        let dict = dict_encode(&stats.source().clone().into_array())?;
×
193

194
        // If we are not allowed to cascade, do not attempt codes or values compression.
UNCOV
195
        if allowed_cascading == 0 {
×
196
            return Ok(dict.into_array());
×
UNCOV
197
        }
×
198

199
        // Find best compressor for codes and values separately
UNCOV
200
        let compressed_codes = IntCompressor::compress(
×
UNCOV
201
            &dict.codes().to_primitive()?,
×
UNCOV
202
            is_sample,
×
UNCOV
203
            allowed_cascading - 1,
×
UNCOV
204
            &[integer::DictScheme.code(), integer::SequenceScheme.code()],
×
205
        )?;
×
206

207
        // Attempt to compress the values with non-Dict compression.
208
        // Currently this will only be FSST.
UNCOV
209
        let compressed_values = StringCompressor::compress(
×
UNCOV
210
            &dict.values().to_varbinview()?,
×
UNCOV
211
            is_sample,
×
UNCOV
212
            allowed_cascading - 1,
×
UNCOV
213
            &[DictScheme.code()],
×
214
        )?;
×
215

UNCOV
216
        Ok(DictArray::try_new(compressed_codes, compressed_values)?.into_array())
×
UNCOV
217
    }
×
218
}
219

220
impl Scheme for FSSTScheme {
221
    type StatsType = StringStats;
222
    type CodeType = StringCode;
223

UNCOV
224
    fn code(&self) -> StringCode {
×
UNCOV
225
        FSST_SCHEME
×
UNCOV
226
    }
×
227

UNCOV
228
    fn compress(
×
UNCOV
229
        &self,
×
UNCOV
230
        stats: &Self::StatsType,
×
UNCOV
231
        is_sample: bool,
×
UNCOV
232
        allowed_cascading: usize,
×
UNCOV
233
        _excludes: &[StringCode],
×
UNCOV
234
    ) -> VortexResult<ArrayRef> {
×
UNCOV
235
        let compressor = fsst_train_compressor(&stats.src.clone().into_array())?;
×
UNCOV
236
        let fsst = fsst_compress(&stats.src.clone().into_array(), &compressor)?;
×
237

UNCOV
238
        let compressed_original_lengths = IntCompressor::compress(
×
UNCOV
239
            &fsst.uncompressed_lengths().to_primitive()?,
×
UNCOV
240
            is_sample,
×
UNCOV
241
            allowed_cascading,
×
UNCOV
242
            &[],
×
243
        )?;
×
244

245
        // We compress the var bin offsets of the FSST codes array.
UNCOV
246
        let compressed_codes_offsets = IntCompressor::compress(
×
UNCOV
247
            &fsst.codes().offsets().to_primitive()?,
×
UNCOV
248
            is_sample,
×
UNCOV
249
            allowed_cascading,
×
UNCOV
250
            &[],
×
251
        )?;
×
UNCOV
252
        let compressed_codes = VarBinArray::try_new(
×
UNCOV
253
            compressed_codes_offsets,
×
UNCOV
254
            fsst.codes().bytes().clone(),
×
UNCOV
255
            fsst.codes().dtype().clone(),
×
UNCOV
256
            fsst.codes().validity().clone(),
×
257
        )?;
×
258

UNCOV
259
        let fsst = FSSTArray::try_new(
×
UNCOV
260
            fsst.dtype().clone(),
×
UNCOV
261
            fsst.symbols().clone(),
×
UNCOV
262
            fsst.symbol_lengths().clone(),
×
UNCOV
263
            compressed_codes,
×
UNCOV
264
            compressed_original_lengths,
×
265
        )?;
×
266

UNCOV
267
        Ok(fsst.into_array())
×
UNCOV
268
    }
×
269
}
270

271
#[cfg(test)]
mod tests {
    use vortex_array::arrays::VarBinViewArray;
    use vortex_dtype::{DType, Nullability};

    use crate::Compressor;
    use crate::string::StringCompressor;

    /// Compresses 2048 strings drawn from two distinct values and checks that the compressed
    /// array has the same number of elements as the input.
    #[test]
    fn test_strings() {
        // 1024 copies of the first value followed by 1024 copies of the second.
        let mut strings = vec![Some("hello-world-1234"); 1024];
        strings.resize(2048, Some("hello-world-56789"));
        let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));

        println!("original array: {}", strings.as_ref().display_tree());

        let compressed = StringCompressor::compress(&strings, false, 3, &[]).unwrap();
        assert_eq!(compressed.len(), strings.len());

        println!("compression tree: {}", compressed.display_tree());
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc