• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 11904027177

19 Nov 2024 12:33AM UTC coverage: 75.477% (+0.3%) from 75.174%
11904027177

push

github

web-flow
Move DateTimePattern into pattern module (#5834)

#1317

Also removes `NeoNeverMarker` and fixes #5689

258 of 319 new or added lines in 6 files covered. (80.88%)

6967 existing lines in 278 files now uncovered.

54522 of 72237 relevant lines covered (75.48%)

655305.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.54
/components/segmenter/src/rule_segmenter.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::complex::ComplexPayloads;
6
use crate::indices::{Latin1Indices, Utf16Indices};
7
use crate::provider::*;
8
use crate::WordType;
9
use core::str::CharIndices;
10
use utf8_iter::Utf8CharIndices;
11

12
/// A trait allowing for RuleBreakIterator to be generalized to multiple string
13
/// encoding methods and granularity such as grapheme cluster, word, etc.
14
pub trait RuleBreakType<'l, 's> {
15
    /// The iterator over characters.
16
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
17

18
    /// The character type.
19
    type CharType: Copy + Into<u32> + core::fmt::Debug;
20

21
    /// Returns how many offset units the character under `iter`'s cursor occupies.
    ///
    /// `Iterator::next` adds this value to a running offset while it replays
    /// break offsets stored in `result_cache`.
    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
22

23
    /// Hook for text that the rule tables alone do not segment.
    ///
    /// `Iterator::next` calls this when both `left_codepoint` (the character it
    /// just consumed) and the character now under the cursor map to the data's
    /// `complex_property`. A `Some(offset)` result is returned from `next` as
    /// the next boundary; on `None`, `next` continues with the regular
    /// break-state table lookup.
    fn handle_complex_language(
24
        iter: &mut RuleBreakIterator<'l, 's, Self>,
25
        left_codepoint: Self::CharType,
26
    ) -> Option<usize>;
27
}
28

29
/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
30
///
31
/// Lifetimes:
32
///
33
/// - `'l` = lifetime of the segmenter object from which this iterator was created
34
/// - `'s` = lifetime of the string being segmented
35
///
36
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
37
/// _after_ the boundary (for a boundary at the end of text, this index is the length
38
/// of the [`str`] or array of code units).
39
#[derive(Debug)]
40
pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
41
    /// Source of `(offset, character)` pairs; `advance_iter` pulls from it.
    pub(crate) iter: Y::IterAttr,
42
    /// Value returned as the end-of-text boundary. `next` also overwrites it
    /// with 1 to remember that the single boundary of an empty input has
    /// already been returned.
    // NOTE(review): presumably initialised to the input length by the
    // constructor, which is not part of this file - confirm.
    pub(crate) len: usize,
43
    /// The pair most recently pulled from `iter`; `None` is what `is_eof`
    /// reports as end of input.
    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
44
    /// Break offsets that `next` replays before consulting the rule tables.
    /// After the first entry is consumed the remaining ones are rebased by
    /// the distance walked.
    // NOTE(review): nothing in this file pushes to this vector; presumably
    // `handle_complex_language` implementations elsewhere fill it - confirm.
    pub(crate) result_cache: alloc::vec::Vec<usize>,
45
    /// Rule data: property table, break-state table, word-type table and the
    /// special property values (`sot_property`, `eot_property`,
    /// `complex_property`, `last_codepoint_property`) read by this file.
    pub(crate) data: &'l RuleBreakDataV2<'l>,
46
    /// Not read anywhere in this file.
    // NOTE(review): presumably consumed by `handle_complex_language`
    // implementations defined elsewhere - confirm.
    pub(crate) complex: Option<&'l ComplexPayloads>,
47
    /// Property recorded for the boundary most recently returned by `next`;
    /// `word_type` maps it through `data.word_type_table`, treating 0 as
    /// "no word type".
    pub(crate) boundary_property: u8,
48
    /// Optional property overrides that `get_break_property` consults before
    /// `data.property_table`.
    pub(crate) locale_override: Option<&'l RuleBreakDataOverrideV1<'l>>,
49
}
50

51
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
52
    type Item = usize;
53

54
    /// Returns the offset of the next boundary, or `None` when there is no
    /// further boundary to report.
    ///
    /// Offsets queued in `result_cache` are replayed first. Otherwise the
    /// method walks the input one character at a time and looks each
    /// (left, right) property pair up in the break-state table. When the
    /// table answers with an `Index`/`Intermediate` state, it keeps scanning
    /// while remembering an earlier iterator position to fall back to if the
    /// longer rule ends in `NoMatch`.
    ///
    /// As a side effect, `boundary_property` is updated on most return paths
    /// so that `word_type` can classify the boundary just returned.
    fn next(&mut self) -> Option<Self::Item> {
91,092✔
55
        // If a previous run left cached break offsets, replay the first one.
56
        if let Some(&first_result) = self.result_cache.first() {
91,092✔
57
            // `i` counts how far we have walked from the current position.
            let mut i = 0;
2✔
58
            loop {
2✔
59
                if i == first_result {
6✔
60
                    // Reached the cached offset: drop it and rebase the
                    // remaining offsets onto the new position.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
×
UNCOV
61
                    return self.get_current_position();
×
62
                }
63
                i += Y::get_current_position_character_len(self);
6✔
64
                self.advance_iter();
6✔
65
                if self.is_eof() {
6✔
66
                    // Input ended before the cached offset was reached:
                    // discard the cache and report the end-of-text boundary.
                    self.result_cache.clear();
2✔
67
                    self.boundary_property = self.data.complex_property;
2✔
68
                    return Some(self.len);
2✔
69
                }
70
            }
71
        }
72

73
        // No character under the cursor: pull one before applying any rule.
        if self.is_eof() {
91,090✔
74
            self.advance_iter();
7,760✔
75
            if self.is_eof() && self.len == 0 {
7,760✔
76
                // Empty string. Since `self.current_pos_data` is always going to be empty,
77
                // we never read `self.len` except for here, so we can use it to mark that
78
                // we have already returned the single empty-string breakpoint.
79
                self.len = 1;
3✔
80
                return Some(0);
3✔
81
            }
82
            let Some(right_prop) = self.get_current_break_property() else {
7,757✔
83
                // The iterator has already reached EOT. Reset the boundary property for word-like.
84
                self.boundary_property = 0;
3,863✔
85
                return None;
3,863✔
86
            };
87
            // SOT x anything
88
            if matches!(
3,894✔
89
                self.get_break_state_from_table(self.data.sot_property, right_prop),
3,894✔
90
                BreakState::Break | BreakState::NoMatch
91
            ) {
92
                self.boundary_property = 0; // SOT is special type
3,894✔
93
                return self.get_current_position();
3,894✔
94
            }
95
        }
96

97
        'a: loop {
98
            debug_assert!(!self.is_eof());
188,531✔
99
            let left_codepoint = self.get_current_codepoint()?;
279,623✔
100
            let left_prop = self.get_break_property(left_codepoint);
188,531✔
101
            self.advance_iter();
188,531✔
102

103
            // No right-hand character: the end of the input is the boundary.
            let Some(right_prop) = self.get_current_break_property() else {
188,531✔
104
                self.boundary_property = left_prop;
3,665✔
105
                return Some(self.len);
3,665✔
106
            };
107

108
            // Some segmenter rule sets have no language-specific rules, so we have to use the
            // LSTM (or dictionary) segmenter.
109
            // If the property is marked as SA, use it.
110
            if right_prop == self.data.complex_property {
184,866✔
111
                if left_prop != self.data.complex_property {
2✔
112
                    // break before SA
113
                    self.boundary_property = left_prop;
×
UNCOV
114
                    return self.get_current_position();
×
115
                }
116
                let break_offset = Y::handle_complex_language(self, left_codepoint);
2✔
117
                if break_offset.is_some() {
2✔
118
                    return break_offset;
2✔
119
                }
120
            }
121

122
            match self.get_break_state_from_table(left_prop, right_prop) {
184,864✔
123
                BreakState::Keep => continue,
124
                BreakState::Break | BreakState::NoMatch => {
125
                    self.boundary_property = left_prop;
71,983✔
126
                    return self.get_current_position();
71,983✔
127
                }
128
                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
15,290✔
129
                    // This isn't a simple rule set. We need a marker to restore the iterator to
                    // the previous position.
130
                    let mut previous_iter = self.iter.clone();
15,290✔
131
                    let mut previous_pos_data = self.current_pos_data;
15,290✔
132
                    let mut previous_left_prop = left_prop;
15,290✔
133

134
                    loop {
15,290✔
135
                        self.advance_iter();
22,836✔
136

137
                        let Some(prop) = self.get_current_break_property() else {
22,810✔
138
                            // Reached EOF. But we are analyzing multiple characters now, so the
                            // next break may be at the previous point.
139
                            self.boundary_property = index;
226✔
140
                            if self.get_break_state_from_table(index, self.data.eot_property)
226✔
141
                                == BreakState::NoMatch
142
                            {
143
                                // The pending rule does not match at EOT: rewind to the marker.
                                self.boundary_property = previous_left_prop;
16✔
144
                                self.iter = previous_iter;
16✔
145
                                self.current_pos_data = previous_pos_data;
16✔
146
                                return self.get_current_position();
16✔
147
                            }
148
                            // EOF
149
                            return Some(self.len);
210✔
150
                        };
151

152
                        // Computed from `index` before the match below overwrites it.
                        let previous_break_state_is_cp_prop =
153
                            index <= self.data.last_codepoint_property;
22,603✔
154

155
                        match self.get_break_state_from_table(index, prop) {
22,573✔
156
                            BreakState::Keep => continue 'a,
157
                            BreakState::NoMatch => {
158
                                // The longer rule failed: rewind to the marker and break there.
                                self.boundary_property = previous_left_prop;
962✔
159
                                self.iter = previous_iter;
962✔
160
                                self.current_pos_data = previous_pos_data;
962✔
161
                                return self.get_current_position();
962✔
162
                            }
163
                            BreakState::Break => return self.get_current_position(),
6,470✔
164
                            BreakState::Intermediate(i) => {
638✔
165
                                index = i;
638✔
166
                                if previous_break_state_is_cp_prop {
638✔
167
                                    // Move marker
168
                                    previous_left_prop = index;
94✔
169
                                }
170
                                // Unlike the `Index` arm, the saved position always moves here.
                                previous_iter = self.iter.clone();
638✔
171
                                previous_pos_data = self.current_pos_data;
638✔
172
                            }
638✔
173
                            BreakState::Index(i) => {
6,908✔
174
                                index = i;
6,908✔
175
                                if previous_break_state_is_cp_prop {
8,602✔
176
                                    // Move marker
177
                                    previous_iter = self.iter.clone();
1,694✔
178
                                    previous_pos_data = self.current_pos_data;
1,694✔
179
                                    previous_left_prop = index;
1,694✔
180
                                }
181
                            }
182
                        }
183
                    }
184
                }
15,268✔
185
            }
186
        }
187
    }
91,070✔
188
}
189

190
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
191
    /// Pulls the next `(offset, character)` pair from `iter` into `current_pos_data`.
    pub(crate) fn advance_iter(&mut self) {
242,132✔
192
        self.current_pos_data = self.iter.next();
242,132✔
193
    }
242,132✔
194

195
    /// Returns `true` when no character is under the cursor, i.e.
    /// `current_pos_data` is `None`.
    pub(crate) fn is_eof(&self) -> bool {
322,444✔
196
        self.current_pos_data.is_none()
322,444✔
197
    }
322,444✔
198

199
    /// Returns the break property of the character under the cursor, or `None`
    /// when there is no such character.
    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
242,245✔
200
        self.get_current_codepoint()
242,245✔
201
            .map(|c| self.get_break_property(c))
225,936✔
202
    }
242,245✔
203

204
    /// Returns the offset half of `current_pos_data`, or `None` when it is empty.
    pub(crate) fn get_current_position(&self) -> Option<usize> {
92,444✔
205
        self.current_pos_data.map(|(pos, _)| pos)
184,871✔
206
    }
92,444✔
207

208
    /// Returns the character half of `current_pos_data`, or `None` when it is empty.
    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
441,006✔
209
        self.current_pos_data.map(|(_, codepoint)| codepoint)
865,683✔
210
    }
441,006✔
211

212
    /// Looks up the break property for `codepoint`.
    ///
    /// A non-zero entry in the locale override table wins; otherwise the value
    /// comes from `data.property_table`.
    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
423,287✔
213
        // Note: Default value is 0 == UNKNOWN
214
        if let Some(locale_override) = &self.locale_override {
423,287✔
215
            let property = locale_override
47,648✔
216
                .property_table_override
217
                .get32(codepoint.into());
23,824✔
218
            // 0 means the override table has no entry, so fall through to the base table.
            if property != 0 {
23,824✔
219
                return property;
1,739✔
220
            }
221
        }
222
        self.data.property_table.get32(codepoint.into())
421,548✔
223
    }
423,287✔
224

225
    /// Returns the break state stored for the `(left, right)` property pair.
    ///
    /// The table is indexed as `left * property_count + right`; an
    /// out-of-range index yields `BreakState::Keep` instead of panicking.
    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
211,353✔
226
        let idx = left as usize * self.data.property_count as usize + right as usize;
211,353✔
227
        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
228
        self.data
422,706✔
229
            .break_state_table
230
            .get(idx)
231
            .unwrap_or(BreakState::Keep)
211,353✔
232
    }
211,353✔
233

234
    /// Return the status value of break boundary.
235
    /// If segmenter isn't word, always return WordType::None
236
    pub fn word_type(&self) -> WordType {
×
UNCOV
237
        if !self.result_cache.is_empty() {
×
238
            // Dictionary type (CJ and East Asian) is letter.
239
            return WordType::Letter;
×
240
        }
UNCOV
241
        if self.boundary_property == 0 {
×
242
            // break position is SOT / Any
UNCOV
243
            return WordType::None;
×
244
        }
UNCOV
245
        // `boundary_property` is non-zero here, so the `- 1` below cannot
        // underflow. A missing table entry falls back to `WordType::None`.
        self.data
×
246
            .word_type_table
UNCOV
247
            .get((self.boundary_property - 1) as usize)
×
UNCOV
248
            .unwrap_or(WordType::None)
×
249
    }
×
250

251
    /// Return true when break boundary is word-like such as letter/number/CJK
252
    /// If segmenter isn't word, return false
253
    pub fn is_word_like(&self) -> bool {
254
        self.word_type().is_word_like()
255
    }
256
}
257

258
/// [`RuleBreakType`] marker for input iterated with [`CharIndices`], with
/// `char` as the character type.
#[derive(Debug)]
×
259
pub struct RuleBreakTypeUtf8;
260

261
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeUtf8 {
262
    type IterAttr = CharIndices<'s>;
263
    type CharType = char;
264

UNCOV
265
    /// Returns the UTF-8 encoded length of the character under the cursor,
    /// or 0 when there is none.
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
UNCOV
266
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
UNCOV
267
    }
×
268

UNCOV
269
    /// Panics if called: this marker type provides no complex-language handling.
    // NOTE(review): relies on this type never being paired with rule data
    // whose `complex_property` can match two adjacent characters - confirm.
    fn handle_complex_language(
×
270
        _: &mut RuleBreakIterator<Self>,
271
        _: Self::CharType,
272
    ) -> Option<usize> {
UNCOV
273
        unreachable!()
×
274
    }
275
}
276

277
/// [`RuleBreakType`] marker for input iterated with [`Utf8CharIndices`], with
/// `char` as the character type.
#[derive(Debug)]
×
278
pub struct RuleBreakTypePotentiallyIllFormedUtf8;
279

280
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
281
    type IterAttr = Utf8CharIndices<'s>;
282
    type CharType = char;
283

UNCOV
284
    /// Returns the UTF-8 encoded length of the character under the cursor,
    /// or 0 when there is none.
    // NOTE(review): for ill-formed input this is the length of the `char` the
    // iterator yielded, which may differ from the number of bytes it
    // consumed - confirm against `utf8_iter`.
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
UNCOV
285
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
UNCOV
286
    }
×
287

UNCOV
288
    /// Panics if called: this marker type provides no complex-language handling.
    // NOTE(review): relies on this type never being paired with rule data
    // whose `complex_property` can match two adjacent characters - confirm.
    fn handle_complex_language(
×
289
        _: &mut RuleBreakIterator<Self>,
290
        _: Self::CharType,
291
    ) -> Option<usize> {
UNCOV
292
        unreachable!()
×
293
    }
294
}
295

UNCOV
296
/// [`RuleBreakType`] marker for input iterated with [`Latin1Indices`], with
/// `u8` as the character type.
#[derive(Debug)]
×
297
pub struct RuleBreakTypeLatin1;
298

299
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeLatin1 {
300
    type IterAttr = Latin1Indices<'s>;
301
    type CharType = u8;
302

UNCOV
303
    /// Panics if called. `Iterator::next` only calls this while
    /// `result_cache` is non-empty.
    // NOTE(review): relies on `result_cache` never being filled for this
    // marker type - confirm.
    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
×
UNCOV
304
        unreachable!()
×
305
    }
306

UNCOV
307
    /// Panics if called: this marker type provides no complex-language handling.
    // NOTE(review): relies on this type never being paired with rule data
    // whose `complex_property` can match two adjacent characters - confirm.
    fn handle_complex_language(
×
308
        _: &mut RuleBreakIterator<Self>,
309
        _: Self::CharType,
310
    ) -> Option<usize> {
UNCOV
311
        unreachable!()
×
312
    }
313
}
314

315
/// [`RuleBreakType`] marker for input iterated with [`Utf16Indices`], with
/// `u32` as the character type.
#[derive(Debug)]
×
316
pub struct RuleBreakTypeUtf16;
317

318
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeUtf16 {
319
    type IterAttr = Utf16Indices<'s>;
320
    type CharType = u32;
321

UNCOV
322
    /// Returns the number of UTF-16 code units for the character under the
    /// cursor: 2 for values at or above 0x10000, 1 for anything lower, and
    /// 0 when there is no character.
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
UNCOV
323
        match iter.get_current_codepoint() {
×
UNCOV
324
            None => 0,
×
325
            // Values at or above 0x10000 need a surrogate pair in UTF-16.
            Some(ch) if ch >= 0x10000 => 2,
×
UNCOV
326
            _ => 1,
×
327
        }
UNCOV
328
    }
×
329

UNCOV
330
    /// Panics if called: this marker type provides no complex-language handling.
    // NOTE(review): relies on this type never being paired with rule data
    // whose `complex_property` can match two adjacent characters - confirm.
    fn handle_complex_language(
×
331
        _: &mut RuleBreakIterator<Self>,
332
        _: Self::CharType,
333
    ) -> Option<usize> {
UNCOV
334
        unreachable!()
×
335
    }
336
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc