• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 6815798908

09 Nov 2023 05:17PM CUT coverage: 72.607% (-2.4%) from 75.01%
6815798908

push

github

web-flow
Implement `Any/BufferProvider` for some smart pointers (#4255)

Allows storing them as a `Box<dyn Any/BufferProvider>` without using a
wrapper type that implements the trait.

44281 of 60987 relevant lines covered (72.61%)

201375.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

70.0
/components/segmenter/src/rule_segmenter.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2,640✔
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::complex::ComplexPayloads;
6
use crate::indices::{Latin1Indices, Utf16Indices};
7
use crate::provider::RuleBreakDataV1;
8
use crate::symbols::*;
9
use core::str::CharIndices;
10
use utf8_iter::Utf8CharIndices;
11

12
/// The category tag that is returned by
/// [`WordBreakIterator::word_type()`][crate::WordBreakIterator::word_type()].
///
/// The enum is `#[repr(u8)]` with explicit discriminants, so each variant has a
/// stable numeric value (`None = 0`, `Number = 1`, `Letter = 2`).
///
/// Equality on a field-less enum is total, so `Eq` is derived alongside
/// `PartialEq`; this lets the type be used wherever an `Eq` bound is required.
#[non_exhaustive]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[repr(u8)]
pub enum RuleStatusType {
    /// No category tag
    None = 0,
    /// Number category tag
    Number = 1,
    /// Letter category tag, including CJK.
    Letter = 2,
}
25

26
/// A trait allowing for RuleBreakIterator to be generalized to multiple string
27
/// encoding methods and granularity such as grapheme cluster, word, etc.
28
pub trait RuleBreakType<'l, 's> {
29
    /// The iterator over characters.
30
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
31

32
    /// The character type.
33
    type CharType: Copy + Into<u32> + core::fmt::Debug;
34

35
    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
36

37
    fn handle_complex_language(
38
        iter: &mut RuleBreakIterator<'l, 's, Self>,
39
        left_codepoint: Self::CharType,
40
    ) -> Option<usize>;
41
}
42

43
/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
44
///
45
/// Lifetimes:
46
///
47
/// - `'l` = lifetime of the segmenter object from which this iterator was created
48
/// - `'s` = lifetime of the string being segmented
49
///
50
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
51
/// _after_ the boundary (for a boundary at the end of text, this index is the length
52
/// of the [`str`] or array of code units).
53
#[derive(Debug)]
54
pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
55
    pub(crate) iter: Y::IterAttr,
56
    pub(crate) len: usize,
57
    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
58
    pub(crate) result_cache: alloc::vec::Vec<usize>,
59
    pub(crate) data: &'l RuleBreakDataV1<'l>,
60
    pub(crate) complex: Option<&'l ComplexPayloads>,
61
    pub(crate) boundary_property: u8,
62
}
63

64
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
65
    type Item = usize;
66

67
    fn next(&mut self) -> Option<Self::Item> {
8,515✔
68
        // If we have break point cache by previous run, return this result
69
        if let Some(&first_result) = self.result_cache.first() {
8,515✔
70
            let mut i = 0;
2✔
71
            loop {
2✔
72
                if i == first_result {
6✔
73
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
×
74
                    return self.get_current_position();
×
75
                }
76
                i += Y::get_current_position_character_len(self);
6✔
77
                self.advance_iter();
6✔
78
                if self.is_eof() {
6✔
79
                    self.result_cache.clear();
2✔
80
                    return Some(self.len);
2✔
81
                }
82
            }
83
        }
84

85
        if self.is_eof() {
8,513✔
86
            self.advance_iter();
4,550✔
87
            if self.is_eof() && self.len == 0 {
4,550✔
88
                // Empty string. Since `self.current_pos_data` is always going to be empty,
89
                // we never read `self.len` except for here, so we can use it to mark that
90
                // we have already returned the single empty-string breakpoint.
91
                self.len = 1;
3✔
92
                return Some(0);
3✔
93
            }
94
            // SOT x anything
95
            let right_prop = self.get_current_break_property()?;
4,547✔
96
            if self.is_break_from_table(self.data.sot_property, right_prop) {
2,277✔
97
                self.boundary_property = 0; // SOT is special type
2,260✔
98
                return self.get_current_position();
2,260✔
99
            }
100
        }
101

102
        loop {
103
            debug_assert!(!self.is_eof());
6,323✔
104
            let left_codepoint = self.get_current_codepoint()?;
6,323✔
105
            let left_prop = self.get_break_property(left_codepoint);
6,323✔
106
            self.advance_iter();
6,323✔
107

108
            let Some(right_prop) = self.get_current_break_property() else {
6,323✔
109
                self.boundary_property = left_prop;
2,138✔
110
                return Some(self.len);
2,138✔
111
            };
112

113
            // Some segmenter rules doesn't have language-specific rules, we have to use LSTM (or dictionary) segmenter.
114
            // If property is marked as SA, use it
115
            if right_prop == self.data.complex_property {
4,185✔
116
                if left_prop != self.data.complex_property {
2✔
117
                    // break before SA
118
                    self.boundary_property = left_prop;
×
119
                    return self.get_current_position();
×
120
                }
121
                let break_offset = Y::handle_complex_language(self, left_codepoint);
2✔
122
                if break_offset.is_some() {
2✔
123
                    return break_offset;
2✔
124
                }
125
            }
126

127
            // If break_state is equals or grater than 0, it is alias of property.
128
            let mut break_state = self.get_break_state_from_table(left_prop, right_prop);
4,183✔
129

130
            if break_state >= 0 {
4,183✔
131
                // This isn't simple rule set. We need marker to restore iterator to previous position.
132
                let mut previous_iter = self.iter.clone();
619✔
133
                let mut previous_pos_data = self.current_pos_data;
619✔
134
                let mut previous_left_prop = left_prop;
619✔
135

136
                break_state &= !INTERMEDIATE_MATCH_RULE;
619✔
137
                loop {
619✔
138
                    self.advance_iter();
1,157✔
139

140
                    let Some(prop) = self.get_current_break_property() else {
1,155✔
141
                        // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
142
                        self.boundary_property = break_state as u8;
159✔
143
                        if self
159✔
144
                            .get_break_state_from_table(break_state as u8, self.data.eot_property)
159✔
145
                            == NOT_MATCH_RULE
146
                        {
147
                            self.boundary_property = previous_left_prop;
12✔
148
                            self.iter = previous_iter;
12✔
149
                            self.current_pos_data = previous_pos_data;
12✔
150
                            return self.get_current_position();
12✔
151
                        }
152
                        // EOF
153
                        return Some(self.len);
147✔
154
                    };
155

156
                    let previous_break_state = break_state;
997✔
157
                    break_state = self.get_break_state_from_table(break_state as u8, prop);
995✔
158
                    if break_state < 0 {
996✔
159
                        break;
160
                    }
161
                    if previous_break_state >= 0
692✔
162
                        && previous_break_state <= self.data.last_codepoint_property
538✔
163
                    {
164
                        // Move marker
165
                        previous_iter = self.iter.clone();
154✔
166
                        previous_pos_data = self.current_pos_data;
154✔
167
                        previous_left_prop = break_state as u8;
154✔
168
                    }
169
                    if (break_state & INTERMEDIATE_MATCH_RULE) != 0 {
560✔
170
                        break_state -= INTERMEDIATE_MATCH_RULE;
22✔
171
                        previous_iter = self.iter.clone();
22✔
172
                        previous_pos_data = self.current_pos_data;
22✔
173
                        previous_left_prop = break_state as u8;
22✔
174
                    }
175
                }
176
                if break_state == KEEP_RULE {
458✔
177
                    continue;
178
                }
179
                if break_state == NOT_MATCH_RULE {
124✔
180
                    self.boundary_property = previous_left_prop;
8✔
181
                    self.iter = previous_iter;
8✔
182
                    self.current_pos_data = previous_pos_data;
8✔
183
                    return self.get_current_position();
8✔
184
                }
185
                return self.get_current_position();
116✔
186
            }
617✔
187

188
            if self.is_break_from_table(left_prop, right_prop) {
3,564✔
189
                self.boundary_property = left_prop;
1,555✔
190
                return self.get_current_position();
1,555✔
191
            }
192
        }
193
    }
8,513✔
194
}
195

196
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
197
    pub(crate) fn advance_iter(&mut self) {
35,048✔
198
        self.current_pos_data = self.iter.next();
35,048✔
199
    }
35,048✔
200

201
    pub(crate) fn is_eof(&self) -> bool {
53,683✔
202
        self.current_pos_data.is_none()
53,683✔
203
    }
53,683✔
204

205
    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
34,665✔
206
        self.get_current_codepoint()
69,330✔
207
            .map(|c| self.get_break_property(c))
56,593✔
208
    }
34,665✔
209

210
    pub(crate) fn get_current_position(&self) -> Option<usize> {
13,046✔
211
        self.current_pos_data.map(|(pos, _)| pos)
26,085✔
212
    }
13,046✔
213

214
    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
51,619✔
215
        self.current_pos_data.map(|(_, codepoint)| codepoint)
90,560✔
216
    }
51,619✔
217

218
    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
38,150✔
219
        // Note: Default value is 0 == UNKNOWN
220
        self.data.property_table.0.get32(codepoint.into())
38,150✔
221
    }
38,150✔
222

223
    fn get_break_state_from_table(&self, left: u8, right: u8) -> i8 {
11,147✔
224
        let idx = left as usize * self.data.property_count as usize + right as usize;
11,147✔
225
        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
226
        self.data.break_state_table.0.get(idx).unwrap_or(KEEP_RULE)
11,147✔
227
    }
11,147✔
228

229
    fn is_break_from_table(&self, left: u8, right: u8) -> bool {
5,836✔
230
        let rule = self.get_break_state_from_table(left, right);
5,836✔
231
        if rule == KEEP_RULE {
5,836✔
232
            return false;
2,016✔
233
        }
234
        if rule >= 0 {
3,820✔
235
            // need additional next characters to get break rule.
236
            return false;
×
237
        }
238
        true
3,820✔
239
    }
5,836✔
240

241
    /// Return the status value of break boundary.
242
    /// If segmenter isn't word, always return RuleStatusType::None
243
    pub fn rule_status(&self) -> RuleStatusType {
×
244
        if self.result_cache.first().is_some() {
×
245
            // Dictionary type (CJ and East Asian) is letter.
246
            return RuleStatusType::Letter;
×
247
        }
248
        if self.boundary_property == 0 {
×
249
            // break position is SOT / Any
250
            return RuleStatusType::None;
×
251
        }
252
        match self
×
253
            .data
254
            .rule_status_table
255
            .0
256
            .get((self.boundary_property - 1) as usize)
×
257
        {
258
            Some(1) => RuleStatusType::Number,
×
259
            Some(2) => RuleStatusType::Letter,
×
260
            _ => RuleStatusType::None,
×
261
        }
262
    }
×
263

264
    /// Return true when break boundary is word-like such as letter/number/CJK
265
    /// If segmenter isn't word, return false
266
    pub fn is_word_like(&self) -> bool {
×
267
        self.rule_status() != RuleStatusType::None
×
268
    }
×
269
}
270

271
#[derive(Debug)]
×
272
pub struct RuleBreakTypeUtf8;
273

274
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 {
275
    type IterAttr = CharIndices<'s>;
276
    type CharType = char;
277

278
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
279
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
280
    }
×
281

282
    fn handle_complex_language(
×
283
        _: &mut RuleBreakIterator<Self>,
284
        _: Self::CharType,
285
    ) -> Option<usize> {
286
        unreachable!()
×
287
    }
288
}
289

290
#[derive(Debug)]
×
291
pub struct RuleBreakTypePotentiallyIllFormedUtf8;
292

293
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
294
    type IterAttr = Utf8CharIndices<'s>;
295
    type CharType = char;
296

297
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
298
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
299
    }
×
300

301
    fn handle_complex_language(
×
302
        _: &mut RuleBreakIterator<Self>,
303
        _: Self::CharType,
304
    ) -> Option<usize> {
305
        unreachable!()
×
306
    }
307
}
308

309
#[derive(Debug)]
×
310
pub struct RuleBreakTypeLatin1;
311

312
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 {
313
    type IterAttr = Latin1Indices<'s>;
314
    type CharType = u8;
315

316
    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
×
317
        unreachable!()
×
318
    }
319

320
    fn handle_complex_language(
×
321
        _: &mut RuleBreakIterator<Self>,
322
        _: Self::CharType,
323
    ) -> Option<usize> {
324
        unreachable!()
×
325
    }
326
}
327

328
#[derive(Debug)]
×
329
pub struct RuleBreakTypeUtf16;
330

331
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf16 {
332
    type IterAttr = Utf16Indices<'s>;
333
    type CharType = u32;
334

335
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
336
        match iter.get_current_codepoint() {
×
337
            None => 0,
×
338
            Some(ch) if ch >= 0x10000 => 2,
×
339
            _ => 1,
×
340
        }
341
    }
×
342

343
    fn handle_complex_language(
×
344
        _: &mut RuleBreakIterator<Self>,
345
        _: Self::CharType,
346
    ) -> Option<usize> {
347
        unreachable!()
×
348
    }
349
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc