• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

vortex-data / vortex / 16331938722

16 Jul 2025 10:49PM UTC coverage: 80.702% (-0.9%) from 81.557%
16331938722

push

github

web-flow
feat: build with stable rust (#3881)

120 of 173 new or added lines in 28 files covered. (69.36%)

174 existing lines in 102 files now uncovered.

41861 of 51871 relevant lines covered (80.7%)

157487.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.21
/vortex-btrblocks/src/string.rs
1
// SPDX-License-Identifier: Apache-2.0
2
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3

4
use vortex_array::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
5
use vortex_array::vtable::ValidityHelper;
6
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
7
use vortex_dict::DictArray;
8
use vortex_dict::builders::dict_encode;
9
use vortex_error::{VortexExpect, VortexResult};
10
use vortex_fsst::{FSSTArray, fsst_compress, fsst_train_compressor};
11
use vortex_utils::aliases::hash_set::HashSet;
12

13
use crate::integer::IntCompressor;
14
use crate::sample::sample;
15
use crate::{
16
    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
17
    estimate_compression_ratio_with_sampling,
18
};
19

20
#[derive(Clone, Debug)]
21
pub struct StringStats {
22
    src: VarBinViewArray,
23
    estimated_distinct_count: u32,
24
    value_count: u32,
25
    // null_count: u32,
26
}
27

28
/// Estimate the number of distinct strings in the var bin view array.
29
#[allow(clippy::cast_possible_truncation)]
30
fn estimate_distinct_count(strings: &VarBinViewArray) -> u32 {
4,536✔
31
    let views = strings.views();
4,536✔
32
    // Iterate the views. Two strings which are equal must have the same first 8-bytes.
33
    // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all
34
    // share a 4-byte prefix and have the same length.
35
    let mut distinct = HashSet::with_capacity(views.len() / 2);
4,536✔
36
    views.iter().for_each(|&view| {
2,816,250✔
37
        let len_and_prefix = view.as_u128() as u64;
2,816,250✔
38
        distinct.insert(len_and_prefix);
2,816,250✔
39
    });
2,816,250✔
40

41
    distinct
4,536✔
42
        .len()
4,536✔
43
        .try_into()
4,536✔
44
        .vortex_expect("distinct count must fit in u32")
4,536✔
45
}
4,536✔
46

47
impl CompressorStats for StringStats {
48
    type ArrayVTable = VarBinViewVTable;
49

50
    fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self {
5,166✔
51
        let null_count = input
5,166✔
52
            .statistics()
5,166✔
53
            .compute_null_count()
5,166✔
54
            .vortex_expect("null count");
5,166✔
55
        let value_count = input.len() - null_count;
5,166✔
56
        let estimated_distinct = if opts.count_distinct_values {
5,166✔
57
            estimate_distinct_count(input)
4,536✔
58
        } else {
59
            u32::MAX
630✔
60
        };
61

62
        Self {
5,166✔
63
            src: input.clone(),
5,166✔
64
            value_count: value_count.try_into().vortex_expect("value_count"),
5,166✔
65
            // null_count: null_count.try_into().vortex_expect("null_count"),
5,166✔
66
            estimated_distinct_count: estimated_distinct,
5,166✔
67
        }
5,166✔
68
    }
5,166✔
69

70
    fn source(&self) -> &VarBinViewArray {
7,129✔
71
        &self.src
7,129✔
72
    }
7,129✔
73

74
    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
2,583✔
75
        let sampled = sample(self.src.as_ref(), sample_size, sample_count)
2,583✔
76
            .to_varbinview()
2,583✔
77
            .vortex_expect("varbinview");
2,583✔
78

79
        Self::generate_opts(&sampled, opts)
2,583✔
80
    }
2,583✔
81
}
82

83
pub struct StringCompressor;
84

85
impl Compressor for StringCompressor {
86
    type ArrayVTable = VarBinViewVTable;
87
    type SchemeType = dyn StringScheme;
88
    type StatsType = StringStats;
89

90
    fn schemes() -> &'static [&'static Self::SchemeType] {
2,583✔
91
        &[&UncompressedScheme, &DictScheme, &FSSTScheme]
2,583✔
92
    }
2,583✔
93

94
    fn default_scheme() -> &'static Self::SchemeType {
990✔
95
        &UncompressedScheme
990✔
96
    }
990✔
97

98
    fn dict_scheme_code() -> StringCode {
2,583✔
99
        DICT_SCHEME
2,583✔
100
    }
2,583✔
101
}
102

103
pub trait StringScheme: Scheme<StatsType = StringStats, CodeType = StringCode> {}
104

105
impl<T> StringScheme for T where T: Scheme<StatsType = StringStats, CodeType = StringCode> {}
106

107
#[derive(Debug, Copy, Clone)]
108
pub struct UncompressedScheme;
109

110
#[derive(Debug, Copy, Clone)]
111
pub struct DictScheme;
112

113
#[derive(Debug, Copy, Clone)]
114
pub struct FSSTScheme;
115

116
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
117
pub struct StringCode(u8);
118

119
const UNCOMPRESSED_SCHEME: StringCode = StringCode(0);
120
const DICT_SCHEME: StringCode = StringCode(1);
121
const FSST_SCHEME: StringCode = StringCode(2);
122

123
impl Scheme for UncompressedScheme {
124
    type StatsType = StringStats;
125
    type CodeType = StringCode;
126

127
    fn code(&self) -> StringCode {
2,583✔
128
        UNCOMPRESSED_SCHEME
2,583✔
129
    }
2,583✔
130

131
    fn expected_compression_ratio(
2,583✔
132
        &self,
2,583✔
133
        _stats: &Self::StatsType,
2,583✔
134
        _is_sample: bool,
2,583✔
135
        _allowed_cascading: usize,
2,583✔
136
        _excludes: &[StringCode],
2,583✔
137
    ) -> VortexResult<f64> {
2,583✔
138
        Ok(1.0)
2,583✔
139
    }
2,583✔
140

141
    fn compress(
990✔
142
        &self,
990✔
143
        stats: &Self::StatsType,
990✔
144
        _is_sample: bool,
990✔
145
        _allowed_cascading: usize,
990✔
146
        _excludes: &[StringCode],
990✔
147
    ) -> VortexResult<ArrayRef> {
990✔
148
        Ok(stats.source().to_array())
990✔
149
    }
990✔
150
}
151

152
impl Scheme for DictScheme {
153
    type StatsType = StringStats;
154
    type CodeType = StringCode;
155

156
    fn code(&self) -> StringCode {
3,213✔
157
        DICT_SCHEME
3,213✔
158
    }
3,213✔
159

160
    fn expected_compression_ratio(
1,953✔
161
        &self,
1,953✔
162
        stats: &Self::StatsType,
1,953✔
163
        is_sample: bool,
1,953✔
164
        allowed_cascading: usize,
1,953✔
165
        excludes: &[StringCode],
1,953✔
166
    ) -> VortexResult<f64> {
1,953✔
167
        // If we don't have a sufficiently high number of distinct values, do not attempt Dict.
168
        if stats.estimated_distinct_count > stats.value_count / 2 {
1,953✔
169
            return Ok(0.0);
1,610✔
170
        }
343✔
171

172
        // If array is all null, do not attempt dict.
173
        if stats.value_count == 0 {
343✔
174
            return Ok(0.0);
×
175
        }
343✔
176

177
        estimate_compression_ratio_with_sampling(
343✔
178
            self,
343✔
179
            stats,
343✔
180
            is_sample,
343✔
181
            allowed_cascading,
343✔
182
            excludes,
343✔
183
        )
184
    }
1,953✔
185

186
    fn compress(
630✔
187
        &self,
630✔
188
        stats: &Self::StatsType,
630✔
189
        is_sample: bool,
630✔
190
        allowed_cascading: usize,
630✔
191
        _excludes: &[StringCode],
630✔
192
    ) -> VortexResult<ArrayRef> {
630✔
193
        let dict = dict_encode(&stats.source().clone().into_array())?;
630✔
194

195
        // If we are not allowed to cascade, do not attempt codes or values compression.
196
        if allowed_cascading == 0 {
630✔
197
            return Ok(dict.into_array());
×
198
        }
630✔
199

200
        // Find best compressor for codes and values separately
201
        let compressed_codes = IntCompressor::compress(
630✔
202
            &dict.codes().to_primitive()?,
630✔
203
            is_sample,
630✔
204
            allowed_cascading - 1,
630✔
205
            &[crate::integer::DictScheme.code()],
630✔
206
        )?;
×
207

208
        // Attempt to compress the values with non-Dict compression.
209
        // Currently this will only be FSST.
210
        let compressed_values = StringCompressor::compress(
630✔
211
            &dict.values().to_varbinview()?,
630✔
212
            is_sample,
630✔
213
            allowed_cascading - 1,
630✔
214
            &[DictScheme.code()],
630✔
215
        )?;
×
216

217
        Ok(DictArray::try_new(compressed_codes, compressed_values)?.into_array())
630✔
218
    }
630✔
219
}
220

221
impl Scheme for FSSTScheme {
222
    type StatsType = StringStats;
223
    type CodeType = StringCode;
224

225
    fn code(&self) -> StringCode {
2,583✔
226
        FSST_SCHEME
2,583✔
227
    }
2,583✔
228

229
    fn compress(
3,889✔
230
        &self,
3,889✔
231
        stats: &Self::StatsType,
3,889✔
232
        is_sample: bool,
3,889✔
233
        allowed_cascading: usize,
3,889✔
234
        _excludes: &[StringCode],
3,889✔
235
    ) -> VortexResult<ArrayRef> {
3,889✔
236
        let compressor = fsst_train_compressor(&stats.src.clone().into_array())?;
3,889✔
237
        let fsst = fsst_compress(&stats.src.clone().into_array(), &compressor)?;
3,889✔
238

239
        let compressed_original_lengths = IntCompressor::compress(
3,889✔
240
            &fsst.uncompressed_lengths().to_primitive()?,
3,889✔
241
            is_sample,
3,889✔
242
            allowed_cascading,
3,889✔
243
            &[],
3,889✔
244
        )?;
×
245

246
        // We compress the var bin offsets of the FSST codes array.
247
        let compressed_codes_offsets = IntCompressor::compress(
3,889✔
248
            &fsst.codes().offsets().to_primitive()?,
3,889✔
249
            is_sample,
3,889✔
250
            allowed_cascading,
3,889✔
251
            &[],
3,889✔
252
        )?;
×
253
        let compressed_codes = VarBinArray::try_new(
3,889✔
254
            compressed_codes_offsets,
3,889✔
255
            fsst.codes().bytes().clone(),
3,889✔
256
            fsst.codes().dtype().clone(),
3,889✔
257
            fsst.codes().validity().clone(),
3,889✔
UNCOV
258
        )?;
×
259

260
        let fsst = FSSTArray::try_new(
3,889✔
261
            fsst.dtype().clone(),
3,889✔
262
            fsst.symbols().clone(),
3,889✔
263
            fsst.symbol_lengths().clone(),
3,889✔
264
            compressed_codes,
3,889✔
265
            compressed_original_lengths,
3,889✔
UNCOV
266
        )?;
×
267

268
        Ok(fsst.into_array())
3,889✔
269
    }
3,889✔
270
}
271

272
#[cfg(test)]
273
mod tests {
274
    use vortex_array::arrays::VarBinViewArray;
275
    use vortex_dtype::{DType, Nullability};
276

277
    use crate::Compressor;
278
    use crate::string::StringCompressor;
279

280
    #[test]
281
    fn test_strings() {
1✔
282
        let mut strings = Vec::new();
1✔
283
        for _ in 0..1024 {
1,025✔
284
            strings.push(Some("hello-world-1234"));
1,024✔
285
        }
1,024✔
286
        for _ in 0..1024 {
1,025✔
287
            strings.push(Some("hello-world-56789"));
1,024✔
288
        }
1,024✔
289
        let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));
1✔
290

291
        println!("original array: {}", strings.as_ref().display_tree());
1✔
292

293
        let compressed = StringCompressor::compress(&strings, false, 3, &[]).unwrap();
1✔
294

295
        println!("compression tree: {}", compressed.display_tree());
1✔
296
    }
1✔
297
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc