• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16593537393

29 Jul 2025 10:28AM UTC coverage: 82.152% (-0.1%) from 82.255%
16593537393

Pull #4048

github

web-flow
Merge 57f185fe9 into 348079fc3
Pull Request #4048: fix: Forbid SequenceScheme for String dictionary codes as well

2 of 2 new or added lines in 2 files covered. (100.0%)

62 existing lines in 10 files now uncovered.

44289 of 53911 relevant lines covered (82.15%)

168428.29 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.27
/vortex-btrblocks/src/string.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
use vortex_array::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
5
use vortex_array::vtable::ValidityHelper;
6
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
7
use vortex_dict::DictArray;
8
use vortex_dict::builders::dict_encode;
9
use vortex_error::{VortexExpect, VortexResult};
10
use vortex_fsst::{FSSTArray, fsst_compress, fsst_train_compressor};
11
use vortex_utils::aliases::hash_set::HashSet;
12

13
use crate::integer::IntCompressor;
14
use crate::sample::sample;
15
use crate::{
16
    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
17
    estimate_compression_ratio_with_sampling,
18
};
19

20
#[derive(Clone, Debug)]
21
pub struct StringStats {
22
    src: VarBinViewArray,
23
    estimated_distinct_count: u32,
24
    value_count: u32,
25
    // null_count: u32,
26
}
27

28
/// Estimate the number of distinct strings in the var bin view array.
29
#[allow(clippy::cast_possible_truncation)]
30
fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
4,812✔
31
    let views = strings.views();
4,812✔
32
    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
33
    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
34
    // share a 4-byte prefix and have the same length.
35
    let mut distinct = HashSet::with_capacity(views.len() / 2);
4,812✔
36
    views.iter().for_each(|&view| {
2,806,740✔
37
        let len_and_prefix = view.as_u128() as u64;
2,806,740✔
38
        distinct.insert(len_and_prefix);
2,806,740✔
39
    });
2,806,740✔
40

41
    distinct
4,812✔
42
        .len()
4,812✔
43
        .try_into()
4,812✔
44
        .vortex_expect("distinct count must fit in u32")
4,812✔
45
}
4,812✔
46

47
impl CompressorStats for StringStats {
48
    type ArrayVTable = VarBinViewVTable;
49

50
    fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
5,478✔
51
        let null_count = input
5,478✔
52
            .statistics()
5,478✔
53
            .compute_null_count()
5,478✔
54
            .vortex_expect("null count");
5,478✔
55
        let value_count = input.len() - null_count;
5,478✔
56
        let estimated_distinct = if opts.count_distinct_values {
5,478✔
57
            estimate_distinct_count(input)
4,812✔
58
        } else {
59
            u32::MAX
666✔
60
        };
61

62
        Self {
5,478✔
63
            src: input.clone(),
5,478✔
64
            value_count: value_count.try_into().vortex_expect("value_count"),
5,478✔
65
            estimated_distinct_count: estimated_distinct,
5,478✔
66
        }
5,478✔
67
    }
5,478✔
68

69
    fn source(&self) -> &VarBinViewArray {
7,751✔
70
        &self.src
7,751✔
71
    }
7,751✔
72

73
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
2,739✔
74
        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
2,739✔
75
            .to_varbinview()
2,739✔
76
            .vortex_expect("varbinview");
2,739✔
77

78
        Self::generate_opts(&sampled, opts)
2,739✔
79
    }
2,739✔
80
}
81

82
pub struct StringCompressor;
83

84
impl Compressor for StringCompressor {
85
    type ArrayVTable = VarBinViewVTable;
86
    type SchemeType = dyn StringScheme;
87
    type StatsType = StringStats;
88

89
    fn schemes() -> &'static [&'static Self::SchemeType] {
2,739✔
90
        &[&UncompressedScheme, &DictScheme, &FSSTScheme]
2,739✔
91
    }
2,739✔
92

93
    fn default_scheme() -> &'static Self::SchemeType {
1,232✔
94
        &UncompressedScheme
1,232✔
95
    }
1,232✔
96

97
    fn dict_scheme_code() -> StringCode {
2,739✔
98
        DICT_SCHEME
2,739✔
99
    }
2,739✔
100
}
101

102
pub trait StringScheme: Scheme<StatsType = StringStats, CodeType = StringCode> {}
103

104
impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = StringCode> {}
105

106
#[derive(Debug, Copy, Clone)]
107
pub struct UncompressedScheme;
108

109
#[derive(Debug, Copy, Clone)]
110
pub struct DictScheme;
111

112
#[derive(Debug, Copy, Clone)]
113
pub struct FSSTScheme;
114

115
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
116
pub struct StringCode(u8);
117

118
const UNCOMPRESSED_SCHEME: StringCode = StringCode(0);
119
const DICT_SCHEME: StringCode = StringCode(1);
120
const FSST_SCHEME: StringCode = StringCode(2);
121

122
impl Scheme for UncompressedScheme {
123
    type StatsType = StringStats;
124
    type CodeType = StringCode;
125

126
    fn code(&self) -> StringCode {
2,739✔
127
        UNCOMPRESSED_SCHEME
2,739✔
128
    }
2,739✔
129

130
    fn expected_compression_ratio(
2,739✔
131
        &self,
2,739✔
132
        _stats: &Self::StatsType,
2,739✔
133
        _is_sample: bool,
2,739✔
134
        _allowed_cascading: usize,
2,739✔
135
        _excludes: &[StringCode],
2,739✔
136
    ) -> VortexResult<f64> {
2,739✔
137
        Ok(1.0)
2,739✔
138
    }
2,739✔
139

140
    fn compress(
1,232✔
141
        &self,
1,232✔
142
        stats: &Self::StatsType,
1,232✔
143
        _is_sample: bool,
1,232✔
144
        _allowed_cascading: usize,
1,232✔
145
        _excludes: &[StringCode],
1,232✔
146
    ) -> VortexResult<ArrayRef> {
1,232✔
147
        Ok(stats.source().to_array())
1,232✔
148
    }
1,232✔
149
}
150

151
impl Scheme for DictScheme {
152
    type StatsType = StringStats;
153
    type CodeType = StringCode;
154

155
    fn code(&self) -> StringCode {
3,405✔
156
        DICT_SCHEME
3,405✔
157
    }
3,405✔
158

159
    fn expected_compression_ratio(
2,073✔
160
        &self,
2,073✔
161
        stats: &Self::StatsType,
2,073✔
162
        is_sample: bool,
2,073✔
163
        allowed_cascading: usize,
2,073✔
164
        excludes: &[StringCode],
2,073✔
165
    ) -> VortexResult<f64> {
2,073✔
166
        // If we don't have a sufficiently high number of distinct values, do not attempt Dict.
167
        if stats.estimated_distinct_count > stats.value_count / 2 {
2,073✔
168
            return Ok(0.0);
1,698✔
169
        }
375✔
170

171
        // If array is all null, do not attempt dict.
172
        if stats.value_count == 0 {
375✔
173
            return Ok(0.0);
×
174
        }
375✔
175

176
        estimate_compression_ratio_with_sampling(
375✔
177
            self,
375✔
178
            stats,
375✔
179
            is_sample,
375✔
180
            allowed_cascading,
375✔
181
            excludes,
375✔
182
        )
183
    }
2,073✔
184

185
    fn compress(
666✔
186
        &self,
666✔
187
        stats: &Self::StatsType,
666✔
188
        is_sample: bool,
666✔
189
        allowed_cascading: usize,
666✔
190
        _excludes: &[StringCode],
666✔
191
    ) -> VortexResult<ArrayRef> {
666✔
192
        let dict = dict_encode(&stats.source().clone().into_array())?;
666✔
193

194
        // If we are not allowed to cascade, do not attempt codes or values compression.
195
        if allowed_cascading == 0 {
666✔
196
            return Ok(dict.into_array());
×
197
        }
666✔
198

199
        // Find best compressor for codes and values separately
200
        let compressed_codes = IntCompressor::compress(
666✔
201
            &dict.codes().to_primitive()?,
666✔
202
            is_sample,
666✔
203
            allowed_cascading - 1,
666✔
204
            &[
666✔
205
                crate::integer::DictScheme.code(),
666✔
206
                crate::integer::SequenceScheme.code(),
666✔
207
            ],
666✔
UNCOV
208
        )?;
×
209

210
        // Attempt to compress the values with non-Dict compression.
211
        // Currently this will only be FSST.
212
        let compressed_values = StringCompressor::compress(
666✔
213
            &dict.values().to_varbinview()?,
666✔
214
            is_sample,
666✔
215
            allowed_cascading - 1,
666✔
216
            &[DictScheme.code()],
666✔
UNCOV
217
        )?;
×
218

219
        Ok(DictArray::try_new(compressed_codes, compressed_values)?.into_array())
666✔
220
    }
666✔
221
}
222

223
impl Scheme for FSSTScheme {
224
    type StatsType = StringStats;
225
    type CodeType = StringCode;
226

227
    fn code(&self) -> StringCode {
2,739✔
228
        FSST_SCHEME
2,739✔
229
    }
2,739✔
230

231
    fn compress(
3,955✔
232
        &self,
3,955✔
233
        stats: &Self::StatsType,
3,955✔
234
        is_sample: bool,
3,955✔
235
        allowed_cascading: usize,
3,955✔
236
        _excludes: &[StringCode],
3,955✔
237
    ) -> VortexResult<ArrayRef> {
3,955✔
238
        let compressor = fsst_train_compressor(&stats.src.clone().into_array())?;
3,955✔
239
        let fsst = fsst_compress(&stats.src.clone().into_array(), &compressor)?;
3,955✔
240

241
        let compressed_original_lengths = IntCompressor::compress(
3,955✔
242
            &fsst.uncompressed_lengths().to_primitive()?,
3,955✔
243
            is_sample,
3,955✔
244
            allowed_cascading,
3,955✔
245
            &[],
3,955✔
UNCOV
246
        )?;
×
247

248
        // We compress the var bin offsets of the FSST codes array.
249
        let compressed_codes_offsets = IntCompressor::compress(
3,955✔
250
            &fsst.codes().offsets().to_primitive()?,
3,955✔
251
            is_sample,
3,955✔
252
            allowed_cascading,
3,955✔
253
            &[],
3,955✔
UNCOV
254
        )?;
×
255
        let compressed_codes = VarBinArray::try_new(
3,955✔
256
            compressed_codes_offsets,
3,955✔
257
            fsst.codes().bytes().clone(),
3,955✔
258
            fsst.codes().dtype().clone(),
3,955✔
259
            fsst.codes().validity().clone(),
3,955✔
UNCOV
260
        )?;
×
261

262
        let fsst = FSSTArray::try_new(
3,955✔
263
            fsst.dtype().clone(),
3,955✔
264
            fsst.symbols().clone(),
3,955✔
265
            fsst.symbol_lengths().clone(),
3,955✔
266
            compressed_codes,
3,955✔
267
            compressed_original_lengths,
3,955✔
UNCOV
268
        )?;
×
269

270
        Ok(fsst.into_array())
3,955✔
271
    }
3,955✔
272
}
273

274
#[cfg(test)]
275
mod tests {
276
    use vortex_array::arrays::VarBinViewArray;
277
    use vortex_dtype::{DType, Nullability};
278

279
    use crate::Compressor;
280
    use crate::string::StringCompressor;
281

282
    #[test]
283
    fn test_strings() {
1✔
284
        let mut strings = Vec::new();
1✔
285
        for _ in 0..1024 {
1,025✔
286
            strings.push(Some("hello-world-1234"));
1,024✔
287
        }
1,024✔
288
        for _ in 0..1024 {
1,025✔
289
            strings.push(Some("hello-world-56789"));
1,024✔
290
        }
1,024✔
291
        let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));
1✔
292

293
        println!("original array: {}", strings.as_ref().display_tree());
1✔
294

295
        let compressed = StringCompressor::compress(&strings, false, 3, &[]).unwrap();
1✔
296

297
        println!("compression tree: {}", compressed.display_tree());
1✔
298
    }
1✔
299
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc