• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 13958601093

19 Mar 2025 04:17PM UTC coverage: 74.164% (-1.5%) from 75.71%
13958601093

push

github

web-flow
Clean up properties docs (#6315)

58056 of 78281 relevant lines covered (74.16%)

819371.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.54
/components/segmenter/src/rule_segmenter.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::complex::ComplexPayloads;
6
use crate::indices::{Latin1Indices, Utf16Indices};
7
use crate::options::WordType;
8
use crate::provider::*;
9
use core::str::CharIndices;
10
use utf8_iter::Utf8CharIndices;
11

12
/// A trait allowing for RuleBreakIterator to be generalized to multiple string
13
/// encoding methods and granularity such as grapheme cluster, word, etc.
14
///
15
/// <div class="stab unstable">
16
/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
17
/// trait, please consider using a type from the implementors listed below.
18
/// </div>
19
pub trait RuleBreakType<'l, 's>: crate::private::Sealed {
20
    /// The iterator over characters.
21
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
22

23
    /// The character type.
24
    type CharType: Copy + Into<u32> + core::fmt::Debug;
25

26
    #[doc(hidden)]
27
    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
28

29
    #[doc(hidden)]
30
    fn handle_complex_language(
31
        iter: &mut RuleBreakIterator<'l, 's, Self>,
32
        left_codepoint: Self::CharType,
33
    ) -> Option<usize>;
34
}
35

36
/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
37
///
38
/// Lifetimes:
39
///
40
/// - `'l` = lifetime of the segmenter object from which this iterator was created
41
/// - `'s` = lifetime of the string being segmented
42
///
43
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
44
/// _after_ the boundary (for a boundary at the end of text, this index is the length
45
/// of the [`str`] or array of code units).
46
#[derive(Debug)]
47
pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
48
    pub(crate) iter: Y::IterAttr,
49
    pub(crate) len: usize,
50
    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
51
    pub(crate) result_cache: alloc::vec::Vec<usize>,
52
    pub(crate) data: &'l RuleBreakData<'l>,
53
    pub(crate) complex: Option<&'l ComplexPayloads>,
54
    pub(crate) boundary_property: u8,
55
    pub(crate) locale_override: Option<&'l RuleBreakDataOverride<'l>>,
56
}
57

58
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
59
    type Item = usize;
60

61
    fn next(&mut self) -> Option<Self::Item> {
91,082✔
62
        // If we have break point cache by previous run, return this result
63
        if let Some(&first_result) = self.result_cache.first() {
91,082✔
64
            let mut i = 0;
2✔
65
            loop {
2✔
66
                if i == first_result {
6✔
67
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
×
68
                    return self.get_current_position();
×
69
                }
70
                i += Y::get_current_position_character_len(self);
6✔
71
                self.advance_iter();
6✔
72
                if self.is_eof() {
6✔
73
                    self.result_cache.clear();
2✔
74
                    self.boundary_property = self.data.complex_property;
2✔
75
                    return Some(self.len);
2✔
76
                }
77
            }
78
        }
79

80
        if self.is_eof() {
91,080✔
81
            self.advance_iter();
7,757✔
82
            if self.is_eof() && self.len == 0 {
7,757✔
83
                // Empty string. Since `self.current_pos_data` is always going to be empty,
84
                // we never read `self.len` except for here, so we can use it to mark that
85
                // we have already returned the single empty-string breakpoint.
86
                self.len = 1;
3✔
87
                return Some(0);
3✔
88
            }
89
            let Some(right_prop) = self.get_current_break_property() else {
7,754✔
90
                // iterator already reaches to EOT. Reset boundary property for word-like.
91
                self.boundary_property = 0;
3,862✔
92
                return None;
3,862✔
93
            };
94
            // SOT x anything
95
            if matches!(
3,892✔
96
                self.get_break_state_from_table(self.data.sot_property, right_prop),
3,892✔
97
                BreakState::Break | BreakState::NoMatch
98
            ) {
99
                self.boundary_property = 0; // SOT is special type
3,892✔
100
                return self.get_current_position();
3,892✔
101
            }
102
        }
103

104
        'a: loop {
105
            debug_assert!(!self.is_eof());
188,702✔
106
            let left_codepoint = self.get_current_codepoint()?;
279,784✔
107
            let left_prop = self.get_break_property(left_codepoint);
188,702✔
108
            self.advance_iter();
188,702✔
109

110
            let Some(right_prop) = self.get_current_break_property() else {
188,702✔
111
                self.boundary_property = left_prop;
3,665✔
112
                return Some(self.len);
3,665✔
113
            };
114

115
            // Some segmenter rules doesn't have language-specific rules, we have to use LSTM (or dictionary) segmenter.
116
            // If property is marked as SA, use it
117
            if right_prop == self.data.complex_property {
185,037✔
118
                if left_prop != self.data.complex_property {
2✔
119
                    // break before SA
120
                    self.boundary_property = left_prop;
×
121
                    return self.get_current_position();
×
122
                }
123
                let break_offset = Y::handle_complex_language(self, left_codepoint);
2✔
124
                if break_offset.is_some() {
2✔
125
                    return break_offset;
2✔
126
                }
127
            }
128

129
            match self.get_break_state_from_table(left_prop, right_prop) {
185,035✔
130
                BreakState::Keep => continue,
131
                BreakState::Break | BreakState::NoMatch => {
132
                    self.boundary_property = left_prop;
71,992✔
133
                    return self.get_current_position();
71,992✔
134
                }
135
                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
15,273✔
136
                    // This isn't simple rule set. We need marker to restore iterator to previous position.
137
                    let mut previous_iter = self.iter.clone();
15,273✔
138
                    let mut previous_pos_data = self.current_pos_data;
15,273✔
139
                    let mut previous_left_prop = left_prop;
15,273✔
140

141
                    loop {
15,273✔
142
                        self.advance_iter();
22,820✔
143

144
                        let Some(prop) = self.get_current_break_property() else {
22,810✔
145
                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
146
                            self.boundary_property = index;
226✔
147
                            if self.get_break_state_from_table(index, self.data.eot_property)
226✔
148
                                == BreakState::NoMatch
149
                            {
150
                                self.boundary_property = previous_left_prop;
16✔
151
                                self.iter = previous_iter;
16✔
152
                                self.current_pos_data = previous_pos_data;
16✔
153
                                return self.get_current_position();
16✔
154
                            }
155
                            // EOF
156
                            return Some(self.len);
210✔
157
                        };
158

159
                        let previous_break_state_is_cp_prop =
160
                            index <= self.data.last_codepoint_property;
22,592✔
161

162
                        match self.get_break_state_from_table(index, prop) {
22,584✔
163
                            BreakState::Keep => continue 'a,
164
                            BreakState::NoMatch => {
165
                                self.boundary_property = previous_left_prop;
962✔
166
                                self.iter = previous_iter;
962✔
167
                                self.current_pos_data = previous_pos_data;
962✔
168
                                return self.get_current_position();
962✔
169
                            }
170
                            BreakState::Break => return self.get_current_position(),
6,470✔
171
                            BreakState::Intermediate(i) => {
638✔
172
                                index = i;
638✔
173
                                if previous_break_state_is_cp_prop {
638✔
174
                                    // Move marker
175
                                    previous_left_prop = index;
94✔
176
                                }
177
                                previous_iter = self.iter.clone();
638✔
178
                                previous_pos_data = self.current_pos_data;
638✔
179
                            }
638✔
180
                            BreakState::Index(i) => {
6,909✔
181
                                index = i;
6,909✔
182
                                if previous_break_state_is_cp_prop {
8,603✔
183
                                    // Move marker
184
                                    previous_iter = self.iter.clone();
1,694✔
185
                                    previous_pos_data = self.current_pos_data;
1,694✔
186
                                    previous_left_prop = index;
1,694✔
187
                                }
188
                            }
189
                        }
190
                    }
191
                }
15,267✔
192
            }
193
        }
194
    }
91,076✔
195
}
196

197
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
198
    pub(crate) fn advance_iter(&mut self) {
242,417✔
199
        self.current_pos_data = self.iter.next();
242,417✔
200
    }
242,417✔
201

202
    pub(crate) fn is_eof(&self) -> bool {
322,478✔
203
        self.current_pos_data.is_none()
322,478✔
204
    }
322,478✔
205

206
    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
242,156✔
207
        self.get_current_codepoint()
242,156✔
208
            .map(|c| self.get_break_property(c))
226,253✔
209
    }
242,156✔
210

211
    pub(crate) fn get_current_position(&self) -> Option<usize> {
92,479✔
212
        self.current_pos_data.map(|(pos, _)| pos)
184,941✔
213
    }
92,479✔
214

215
    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
441,581✔
216
        self.current_pos_data.map(|(_, codepoint)| codepoint)
867,197✔
217
    }
441,581✔
218

219
    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
424,851✔
220
        // Note: Default value is 0 == UNKNOWN
221
        if let Some(locale_override) = &self.locale_override {
424,851✔
222
            let property = locale_override
47,648✔
223
                .property_table_override
224
                .get32(codepoint.into());
23,824✔
225
            if property != 0 {
23,824✔
226
                return property;
1,739✔
227
            }
228
        }
229
        self.data.property_table.get32(codepoint.into())
423,112✔
230
    }
424,851✔
231

232
    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
211,612✔
233
        let idx = left as usize * self.data.property_count as usize + right as usize;
211,612✔
234
        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
235
        self.data
423,224✔
236
            .break_state_table
237
            .get(idx)
238
            .unwrap_or(BreakState::Keep)
211,612✔
239
    }
211,612✔
240

241
    /// Return the status value of break boundary.
242
    /// If segmenter isn't word, always return WordType::None
243
    pub fn word_type(&self) -> WordType {
×
244
        if !self.result_cache.is_empty() {
×
245
            // Dictionary type (CJ and East Asian) is letter.
246
            return WordType::Letter;
×
247
        }
248
        if self.boundary_property == 0 {
×
249
            // break position is SOT / Any
250
            return WordType::None;
×
251
        }
252
        self.data
×
253
            .word_type_table
254
            .get((self.boundary_property - 1) as usize)
×
255
            .unwrap_or(WordType::None)
×
256
    }
×
257

258
    /// Return true when break boundary is word-like such as letter/number/CJK
259
    /// If segmenter isn't word, return false
260
    pub fn is_word_like(&self) -> bool {
261
        self.word_type().is_word_like()
262
    }
263
}
264

265
#[derive(Debug)]
×
266
pub struct RuleBreakTypeUtf8;
267

268
impl crate::private::Sealed for RuleBreakTypeUtf8 {}
269

270
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeUtf8 {
271
    type IterAttr = CharIndices<'s>;
272
    type CharType = char;
273

274
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
275
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
276
    }
×
277

278
    fn handle_complex_language(
×
279
        _: &mut RuleBreakIterator<Self>,
280
        _: Self::CharType,
281
    ) -> Option<usize> {
282
        unreachable!()
×
283
    }
284
}
285

286
#[derive(Debug)]
×
287
pub struct RuleBreakTypePotentiallyIllFormedUtf8;
288

289
impl crate::private::Sealed for RuleBreakTypePotentiallyIllFormedUtf8 {}
290

291
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
292
    type IterAttr = Utf8CharIndices<'s>;
293
    type CharType = char;
294

295
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
296
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
297
    }
×
298

299
    fn handle_complex_language(
×
300
        _: &mut RuleBreakIterator<Self>,
301
        _: Self::CharType,
302
    ) -> Option<usize> {
303
        unreachable!()
×
304
    }
305
}
306

307
#[derive(Debug)]
×
308
pub struct RuleBreakTypeLatin1;
309

310
impl crate::private::Sealed for RuleBreakTypeLatin1 {}
311

312
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeLatin1 {
313
    type IterAttr = Latin1Indices<'s>;
314
    type CharType = u8;
315

316
    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
×
317
        unreachable!()
×
318
    }
319

320
    fn handle_complex_language(
×
321
        _: &mut RuleBreakIterator<Self>,
322
        _: Self::CharType,
323
    ) -> Option<usize> {
324
        unreachable!()
×
325
    }
326
}
327

328
#[derive(Debug)]
×
329
pub struct RuleBreakTypeUtf16;
330

331
impl crate::private::Sealed for RuleBreakTypeUtf16 {}
332

333
impl<'s> RuleBreakType<'_, 's> for RuleBreakTypeUtf16 {
334
    type IterAttr = Utf16Indices<'s>;
335
    type CharType = u32;
336

337
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
338
        match iter.get_current_codepoint() {
×
339
            None => 0,
×
340
            Some(ch) if ch >= 0x10000 => 2,
×
341
            _ => 1,
×
342
        }
343
    }
×
344

345
    fn handle_complex_language(
×
346
        _: &mut RuleBreakIterator<Self>,
347
        _: Self::CharType,
348
    ) -> Option<usize> {
349
        unreachable!()
×
350
    }
351
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc