• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16350502539

17 Jul 2025 04:22PM UTC coverage: 80.836% (-0.7%) from 81.557%
16350502539

Pull #3876

github

web-flow
Merge d8ff9e2c1 into d53d06603
Pull Request #3876: feat[layout]: replace register_splits with a layout splits stream

645 of 692 new or added lines in 17 files covered. (93.21%)

372 existing lines in 117 files now uncovered.

42316 of 52348 relevant lines covered (80.84%)

141734.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.21
/vortex-btrblocks/src/string.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
use vortex_array::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
5
use vortex_array::vtable::ValidityHelper;
6
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
7
use vortex_dict::DictArray;
8
use vortex_dict::builders::dict_encode;
9
use vortex_error::{VortexExpect, VortexResult};
10
use vortex_fsst::{FSSTArray, fsst_compress, fsst_train_compressor};
11
use vortex_utils::aliases::hash_set::HashSet;
12

13
use crate::integer::IntCompressor;
14
use crate::sample::sample;
15
use crate::{
16
    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
17
    estimate_compression_ratio_with_sampling,
18
};
19

20
#[derive(Clone, Debug)]
21
pub struct StringStats {
22
    src: VarBinViewArray,
23
    estimated_distinct_count: u32,
24
    value_count: u32,
25
    // null_count: u32,
26
}
27

28
/// Estimate the number of distinct strings in the var bin view array.
29
#[allow(clippy::cast_possible_truncation)]
30
fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
4,650✔
31
    let views = strings.views();
4,650✔
32
    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
33
    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
34
    // share a 4-byte prefix and have the same length.
35
    let mut distinct = HashSet::with_capacity(views.len() / 2);
4,650✔
36
    views.iter().for_each(|&view| {
2,816,554✔
37
        let len_and_prefix = view.as_u128() as u64;
2,816,554✔
38
        distinct.insert(len_and_prefix);
2,816,554✔
39
    });
2,816,554✔
40

41
    distinct
4,650✔
42
        .len()
4,650✔
43
        .try_into()
4,650✔
44
        .vortex_expect("distinct count must fit in u32")
4,650✔
45
}
4,650✔
46

47
impl CompressorStats for StringStats {
48
    type ArrayVTable = VarBinViewVTable;
49

50
    fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
5,318✔
51
        let null_count = input
5,318✔
52
            .statistics()
5,318✔
53
            .compute_null_count()
5,318✔
54
            .vortex_expect("null count");
5,318✔
55
        let value_count = input.len() - null_count;
5,318✔
56
        let estimated_distinct = if opts.count_distinct_values {
5,318✔
57
            estimate_distinct_count(input)
4,650✔
58
        } else {
59
            u32::MAX
668✔
60
        };
61

62
        Self {
5,318✔
63
            src: input.clone(),
5,318✔
64
            value_count: value_count.try_into().vortex_expect("value_count"),
5,318✔
65
            // null_count: null_count.try_into().vortex_expect("null_count"),
5,318✔
66
            estimated_distinct_count: estimated_distinct,
5,318✔
67
        }
5,318✔
68
    }
5,318✔
69

70
    fn source(&self) -> &VarBinViewArray {
7,338✔
71
        &self.src
7,338✔
72
    }
7,338✔
73

74
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
2,659✔
75
        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
2,659✔
76
            .to_varbinview()
2,659✔
77
            .vortex_expect("varbinview");
2,659✔
78

79
        Self::generate_opts(&sampled, opts)
2,659✔
80
    }
2,659✔
81
}
82

83
pub struct StringCompressor;
84

85
impl Compressor for StringCompressor {
86
    type ArrayVTable = VarBinViewVTable;
87
    type SchemeType = dyn StringScheme;
88
    type StatsType = StringStats;
89

90
    fn schemes() -> &'static [&'static Self::SchemeType] {
2,659✔
91
        &[&UncompressedScheme, &DictScheme, &FSSTScheme]
2,659✔
92
    }
2,659✔
93

94
    fn default_scheme() -> &'static Self::SchemeType {
990✔
95
        &UncompressedScheme
990✔
96
    }
990✔
97

98
    fn dict_scheme_code() -> StringCode {
2,659✔
99
        DICT_SCHEME
2,659✔
100
    }
2,659✔
101
}
102

103
pub trait StringScheme: Scheme<StatsType = StringStats, CodeType = StringCode> {}
104

105
impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = StringCode> {}
106

107
#[derive(Debug, Copy, Clone)]
108
pub struct UncompressedScheme;
109

110
#[derive(Debug, Copy, Clone)]
111
pub struct DictScheme;
112

113
#[derive(Debug, Copy, Clone)]
114
pub struct FSSTScheme;
115

116
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
117
pub struct StringCode(u8);
118

119
const UNCOMPRESSED_SCHEME: StringCode = StringCode(0);
120
const DICT_SCHEME: StringCode = StringCode(1);
121
const FSST_SCHEME: StringCode = StringCode(2);
122

123
impl Scheme for UncompressedScheme {
124
    type StatsType = StringStats;
125
    type CodeType = StringCode;
126

127
    fn code(&self) -> StringCode {
2,659✔
128
        UNCOMPRESSED_SCHEME
2,659✔
129
    }
2,659✔
130

131
    fn expected_compression_ratio(
2,659✔
132
        &self,
2,659✔
133
        _stats: &Self::StatsType,
2,659✔
134
        _is_sample: bool,
2,659✔
135
        _allowed_cascading: usize,
2,659✔
136
        _excludes: &[StringCode],
2,659✔
137
    ) -> VortexResult<f64> {
2,659✔
138
        Ok(1.0)
2,659✔
139
    }
2,659✔
140

141
    fn compress(
990✔
142
        &self,
990✔
143
        stats: &Self::StatsType,
990✔
144
        _is_sample: bool,
990✔
145
        _allowed_cascading: usize,
990✔
146
        _excludes: &[StringCode],
990✔
147
    ) -> VortexResult<ArrayRef> {
990✔
148
        Ok(stats.source().to_array())
990✔
149
    }
990✔
150
}
151

152
impl Scheme for DictScheme {
153
    type StatsType = StringStats;
154
    type CodeType = StringCode;
155

156
    fn code(&self) -> StringCode {
3,327✔
157
        DICT_SCHEME
3,327✔
158
    }
3,327✔
159

160
    fn expected_compression_ratio(
1,991✔
161
        &self,
1,991✔
162
        stats: &Self::StatsType,
1,991✔
163
        is_sample: bool,
1,991✔
164
        allowed_cascading: usize,
1,991✔
165
        excludes: &[StringCode],
1,991✔
166
    ) -> VortexResult<f64> {
1,991✔
167
        // If we don't have a sufficiently high number of distinct values, do not attempt Dict.
168
        if stats.estimated_distinct_count > stats.value_count / 2 {
1,991✔
169
            return Ok(0.0);
1,629✔
170
        }
362✔
171

172
        // If array is all null, do not attempt dict.
173
        if stats.value_count == 0 {
362✔
174
            return Ok(0.0);
×
175
        }
362✔
176

177
        estimate_compression_ratio_with_sampling(
362✔
178
            self,
362✔
179
            stats,
362✔
180
            is_sample,
362✔
181
            allowed_cascading,
362✔
182
            excludes,
362✔
183
        )
184
    }
1,991✔
185

186
    fn compress(
668✔
187
        &self,
668✔
188
        stats: &Self::StatsType,
668✔
189
        is_sample: bool,
668✔
190
        allowed_cascading: usize,
668✔
191
        _excludes: &[StringCode],
668✔
192
    ) -> VortexResult<ArrayRef> {
668✔
193
        let dict = dict_encode(&stats.source().clone().into_array())?;
668✔
194

195
        // If we are not allowed to cascade, do not attempt codes or values compression.
196
        if allowed_cascading == 0 {
668✔
197
            return Ok(dict.into_array());
×
198
        }
668✔
199

200
        // Find best compressor for codes and values separately
201
        let compressed_codes = IntCompressor::compress(
668✔
202
            &dict.codes().to_primitive()?,
668✔
203
            is_sample,
668✔
204
            allowed_cascading - 1,
668✔
205
            &[crate::integer::DictScheme.code()],
668✔
206
        )?;
×
207

208
        // Attempt to compress the values with non-Dict compression.
209
        // Currently this will only be FSST.
210
        let compressed_values = StringCompressor::compress(
668✔
211
            &dict.values().to_varbinview()?,
668✔
212
            is_sample,
668✔
213
            allowed_cascading - 1,
668✔
214
            &[DictScheme.code()],
668✔
215
        )?;
×
216

217
        Ok(DictArray::try_new(compressed_codes, compressed_values)?.into_array())
668✔
218
    }
668✔
219
}
220

221
impl Scheme for FSSTScheme {
222
    type StatsType = StringStats;
223
    type CodeType = StringCode;
224

225
    fn code(&self) -> StringCode {
2,659✔
226
        FSST_SCHEME
2,659✔
227
    }
2,659✔
228

229
    fn compress(
4,022✔
230
        &self,
4,022✔
231
        stats: &Self::StatsType,
4,022✔
232
        is_sample: bool,
4,022✔
233
        allowed_cascading: usize,
4,022✔
234
        _excludes: &[StringCode],
4,022✔
235
    ) -> VortexResult<ArrayRef> {
4,022✔
236
        let compressor = fsst_train_compressor(&stats.src.clone().into_array())?;
4,022✔
237
        let fsst = fsst_compress(&stats.src.clone().into_array(), &compressor)?;
4,022✔
238

239
        let compressed_original_lengths = IntCompressor::compress(
4,022✔
240
            &fsst.uncompressed_lengths().to_primitive()?,
4,022✔
241
            is_sample,
4,022✔
242
            allowed_cascading,
4,022✔
243
            &[],
4,022✔
244
        )?;
×
245

246
        // We compress the var bin offsets of the FSST codes array.
247
        let compressed_codes_offsets = IntCompressor::compress(
4,022✔
248
            &fsst.codes().offsets().to_primitive()?,
4,022✔
249
            is_sample,
4,022✔
250
            allowed_cascading,
4,022✔
251
            &[],
4,022✔
252
        )?;
×
253
        let compressed_codes = VarBinArray::try_new(
4,022✔
254
            compressed_codes_offsets,
4,022✔
255
            fsst.codes().bytes().clone(),
4,022✔
256
            fsst.codes().dtype().clone(),
4,022✔
257
            fsst.codes().validity().clone(),
4,022✔
UNCOV
258
        )?;
×
259

260
        let fsst = FSSTArray::try_new(
4,022✔
261
            fsst.dtype().clone(),
4,022✔
262
            fsst.symbols().clone(),
4,022✔
263
            fsst.symbol_lengths().clone(),
4,022✔
264
            compressed_codes,
4,022✔
265
            compressed_original_lengths,
4,022✔
UNCOV
266
        )?;
×
267

268
        Ok(fsst.into_array())
4,022✔
269
    }
4,022✔
270
}
271

272
#[cfg(test)]
273
mod tests {
274
    use vortex_array::arrays::VarBinViewArray;
275
    use vortex_dtype::{DType, Nullability};
276

277
    use crate::Compressor;
278
    use crate::string::StringCompressor;
279

280
    #[test]
281
    fn test_strings() {
1✔
282
        let mut strings = Vec::new();
1✔
283
        for _ in 0..1024 {
1,025✔
284
            strings.push(Some("hello-world-1234"));
1,024✔
285
        }
1,024✔
286
        for _ in 0..1024 {
1,025✔
287
            strings.push(Some("hello-world-56789"));
1,024✔
288
        }
1,024✔
289
        let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));
1✔
290

291
        println!("original array: {}", strings.as_ref().display_tree());
1✔
292

293
        let compressed = StringCompressor::compress(&strings, false, 3, &[]).unwrap();
1✔
294

295
        println!("compression tree: {}", compressed.display_tree());
1✔
296
    }
1✔
297
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc