• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9207498870

23 May 2024 07:11AM UTC coverage: 76.113% (-0.3%) from 76.402%
9207498870

push

github

web-flow
Add to `IsoDurationParser` documentation in `ixdtf` (#4916)

53397 of 70155 relevant lines covered (76.11%)

514353.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.74
/components/segmenter/src/rule_segmenter.rs
1
// This file is part of ICU4X. For terms of use, please see the file
15,273✔
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::complex::ComplexPayloads;
6
use crate::indices::{Latin1Indices, Utf16Indices};
7
use crate::provider::*;
8
use crate::WordType;
9
use core::str::CharIndices;
10
use utf8_iter::Utf8CharIndices;
11

12
/// A trait allowing for RuleBreakIterator to be generalized to multiple string
13
/// encoding methods and granularity such as grapheme cluster, word, etc.
14
pub trait RuleBreakType<'l, 's> {
15
    /// The iterator over characters.
16
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone + core::fmt::Debug;
17

18
    /// The character type.
19
    type CharType: Copy + Into<u32> + core::fmt::Debug;
20

21
    fn get_current_position_character_len(iter: &RuleBreakIterator<'l, 's, Self>) -> usize;
22

23
    fn handle_complex_language(
24
        iter: &mut RuleBreakIterator<'l, 's, Self>,
25
        left_codepoint: Self::CharType,
26
    ) -> Option<usize>;
27
}
28

29
/// Implements the [`Iterator`] trait over the segmenter boundaries of the given string.
30
///
31
/// Lifetimes:
32
///
33
/// - `'l` = lifetime of the segmenter object from which this iterator was created
34
/// - `'s` = lifetime of the string being segmented
35
///
36
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
37
/// _after_ the boundary (for a boundary at the end of text, this index is the length
38
/// of the [`str`] or array of code units).
39
#[derive(Debug)]
40
pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
41
    pub(crate) iter: Y::IterAttr,
42
    pub(crate) len: usize,
43
    pub(crate) current_pos_data: Option<(usize, Y::CharType)>,
44
    pub(crate) result_cache: alloc::vec::Vec<usize>,
45
    pub(crate) data: &'l RuleBreakDataV1<'l>,
46
    pub(crate) complex: Option<&'l ComplexPayloads>,
47
    pub(crate) boundary_property: u8,
48
}
49

50
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
51
    type Item = usize;
52

53
    fn next(&mut self) -> Option<Self::Item> {
91,116✔
54
        // If we have break point cache by previous run, return this result
55
        if let Some(&first_result) = self.result_cache.first() {
91,116✔
56
            let mut i = 0;
2✔
57
            loop {
2✔
58
                if i == first_result {
6✔
59
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
×
60
                    return self.get_current_position();
×
61
                }
62
                i += Y::get_current_position_character_len(self);
6✔
63
                self.advance_iter();
6✔
64
                if self.is_eof() {
6✔
65
                    self.result_cache.clear();
2✔
66
                    self.boundary_property = self.data.complex_property;
2✔
67
                    return Some(self.len);
2✔
68
                }
69
            }
70
        }
71

72
        if self.is_eof() {
91,114✔
73
            self.advance_iter();
7,756✔
74
            if self.is_eof() && self.len == 0 {
7,756✔
75
                // Empty string. Since `self.current_pos_data` is always going to be empty,
76
                // we never read `self.len` except for here, so we can use it to mark that
77
                // we have already returned the single empty-string breakpoint.
78
                self.len = 1;
3✔
79
                return Some(0);
3✔
80
            }
81
            let Some(right_prop) = self.get_current_break_property() else {
7,753✔
82
                // iterator already reaches to EOT. Reset boundary property for word-like.
83
                self.boundary_property = 0;
3,861✔
84
                return None;
3,861✔
85
            };
86
            // SOT x anything
87
            if matches!(
3,892✔
88
                self.get_break_state_from_table(self.data.sot_property, right_prop),
3,892✔
89
                BreakState::Break | BreakState::NoMatch
90
            ) {
91
                self.boundary_property = 0; // SOT is special type
3,892✔
92
                return self.get_current_position();
3,892✔
93
            }
94
        }
95

96
        'a: loop {
97
            debug_assert!(!self.is_eof());
189,053✔
98
            let left_codepoint = self.get_current_codepoint()?;
189,053✔
99
            let left_prop = self.get_break_property(left_codepoint);
189,053✔
100
            self.advance_iter();
189,053✔
101

102
            let Some(right_prop) = self.get_current_break_property() else {
189,053✔
103
                self.boundary_property = left_prop;
3,663✔
104
                return Some(self.len);
3,663✔
105
            };
106

107
            // Some segmenter rules doesn't have language-specific rules, we have to use LSTM (or dictionary) segmenter.
108
            // If property is marked as SA, use it
109
            if right_prop == self.data.complex_property {
185,390✔
110
                if left_prop != self.data.complex_property {
2✔
111
                    // break before SA
112
                    self.boundary_property = left_prop;
×
113
                    return self.get_current_position();
×
114
                }
115
                let break_offset = Y::handle_complex_language(self, left_codepoint);
2✔
116
                if break_offset.is_some() {
2✔
117
                    return break_offset;
2✔
118
                }
119
            }
120

121
            match self.get_break_state_from_table(left_prop, right_prop) {
185,388✔
122
                BreakState::Keep => continue,
123
                BreakState::Break | BreakState::NoMatch => {
124
                    self.boundary_property = left_prop;
72,026✔
125
                    return self.get_current_position();
72,026✔
126
                }
127
                BreakState::Index(mut index) | BreakState::Intermediate(mut index) => {
15,276✔
128
                    // This isn't simple rule set. We need marker to restore iterator to previous position.
129
                    let mut previous_iter = self.iter.clone();
15,276✔
130
                    let mut previous_pos_data = self.current_pos_data;
15,276✔
131
                    let mut previous_left_prop = left_prop;
15,276✔
132

133
                    loop {
15,276✔
134
                        self.advance_iter();
22,823✔
135

136
                        let Some(prop) = self.get_current_break_property() else {
22,814✔
137
                            // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
138
                            self.boundary_property = index;
226✔
139
                            if self.get_break_state_from_table(index, self.data.eot_property)
226✔
140
                                == BreakState::NoMatch
141
                            {
142
                                self.boundary_property = previous_left_prop;
16✔
143
                                self.iter = previous_iter;
16✔
144
                                self.current_pos_data = previous_pos_data;
16✔
145
                                return self.get_current_position();
16✔
146
                            }
147
                            // EOF
148
                            return Some(self.len);
210✔
149
                        };
150

151
                        let previous_break_state_is_cp_prop =
152
                            index <= self.data.last_codepoint_property;
22,591✔
153

154
                        match self.get_break_state_from_table(index, prop) {
22,583✔
155
                            BreakState::Keep => continue 'a,
156
                            BreakState::NoMatch => {
157
                                self.boundary_property = previous_left_prop;
962✔
158
                                self.iter = previous_iter;
962✔
159
                                self.current_pos_data = previous_pos_data;
962✔
160
                                return self.get_current_position();
962✔
161
                            }
162
                            BreakState::Break => return self.get_current_position(),
6,469✔
163
                            BreakState::Intermediate(i) => {
638✔
164
                                index = i;
638✔
165
                                if previous_break_state_is_cp_prop {
638✔
166
                                    // Move marker
167
                                    previous_left_prop = index;
94✔
168
                                }
169
                                previous_iter = self.iter.clone();
638✔
170
                                previous_pos_data = self.current_pos_data;
638✔
171
                            }
638✔
172
                            BreakState::Index(i) => {
6,909✔
173
                                index = i;
6,909✔
174
                                if previous_break_state_is_cp_prop {
8,603✔
175
                                    // Move marker
176
                                    previous_iter = self.iter.clone();
1,694✔
177
                                    previous_pos_data = self.current_pos_data;
1,694✔
178
                                    previous_left_prop = index;
1,694✔
179
                                }
180
                            }
181
                        }
182
                    }
183
                }
15,266✔
184
            }
185
        }
186
    }
91,106✔
187
}
188

189
impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {
190
    pub(crate) fn advance_iter(&mut self) {
242,959✔
191
        self.current_pos_data = self.iter.next();
242,959✔
192
    }
242,959✔
193

194
    pub(crate) fn is_eof(&self) -> bool {
322,721✔
195
        self.current_pos_data.is_none()
322,721✔
196
    }
322,721✔
197

198
    pub(crate) fn get_current_break_property(&self) -> Option<u8> {
242,598✔
199
        self.get_current_codepoint()
485,196✔
200
            .map(|c| self.get_break_property(c))
469,060✔
201
    }
242,598✔
202

203
    pub(crate) fn get_current_position(&self) -> Option<usize> {
92,515✔
204
        self.current_pos_data.map(|(pos, _)| pos)
185,022✔
205
    }
92,515✔
206

207
    pub(crate) fn get_current_codepoint(&self) -> Option<Y::CharType> {
442,031✔
208
        self.current_pos_data.map(|(_, codepoint)| codepoint)
867,932✔
209
    }
442,031✔
210

211
    fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
424,757✔
212
        // Note: Default value is 0 == UNKNOWN
213
        self.data.property_table.get32(codepoint.into())
424,757✔
214
    }
424,757✔
215

216
    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
212,008✔
217
        let idx = left as usize * self.data.property_count as usize + right as usize;
212,008✔
218
        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
219
        self.data
424,016✔
220
            .break_state_table
221
            .get(idx)
222
            .unwrap_or(BreakState::Keep)
212,008✔
223
    }
212,008✔
224

225
    /// Return the status value of break boundary.
226
    /// If segmenter isn't word, always return WordType::None
227
    pub fn word_type(&self) -> WordType {
×
228
        if self.result_cache.first().is_some() {
×
229
            // Dictionary type (CJ and East Asian) is letter.
230
            return WordType::Letter;
×
231
        }
232
        if self.boundary_property == 0 {
×
233
            // break position is SOT / Any
234
            return WordType::None;
×
235
        }
236
        self.data
×
237
            .word_type_table
238
            .get((self.boundary_property - 1) as usize)
×
239
            .unwrap_or(WordType::None)
×
240
    }
×
241

242
    /// Return true when break boundary is word-like such as letter/number/CJK
243
    /// If segmenter isn't word, return false
244
    pub fn is_word_like(&self) -> bool {
245
        self.word_type().is_word_like()
246
    }
247
}
248

249
#[derive(Debug)]
×
250
pub struct RuleBreakTypeUtf8;
251

252
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf8 {
253
    type IterAttr = CharIndices<'s>;
254
    type CharType = char;
255

256
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
257
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
258
    }
×
259

260
    fn handle_complex_language(
×
261
        _: &mut RuleBreakIterator<Self>,
262
        _: Self::CharType,
263
    ) -> Option<usize> {
264
        unreachable!()
×
265
    }
266
}
267

268
#[derive(Debug)]
×
269
pub struct RuleBreakTypePotentiallyIllFormedUtf8;
270

271
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypePotentiallyIllFormedUtf8 {
272
    type IterAttr = Utf8CharIndices<'s>;
273
    type CharType = char;
274

275
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
276
        iter.get_current_codepoint().map_or(0, |c| c.len_utf8())
×
277
    }
×
278

279
    fn handle_complex_language(
×
280
        _: &mut RuleBreakIterator<Self>,
281
        _: Self::CharType,
282
    ) -> Option<usize> {
283
        unreachable!()
×
284
    }
285
}
286

287
#[derive(Debug)]
×
288
pub struct RuleBreakTypeLatin1;
289

290
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeLatin1 {
291
    type IterAttr = Latin1Indices<'s>;
292
    type CharType = u8;
293

294
    fn get_current_position_character_len(_: &RuleBreakIterator<Self>) -> usize {
×
295
        unreachable!()
×
296
    }
297

298
    fn handle_complex_language(
×
299
        _: &mut RuleBreakIterator<Self>,
300
        _: Self::CharType,
301
    ) -> Option<usize> {
302
        unreachable!()
×
303
    }
304
}
305

306
#[derive(Debug)]
×
307
pub struct RuleBreakTypeUtf16;
308

309
impl<'l, 's> RuleBreakType<'l, 's> for RuleBreakTypeUtf16 {
310
    type IterAttr = Utf16Indices<'s>;
311
    type CharType = u32;
312

313
    fn get_current_position_character_len(iter: &RuleBreakIterator<Self>) -> usize {
×
314
        match iter.get_current_codepoint() {
×
315
            None => 0,
×
316
            Some(ch) if ch >= 0x10000 => 2,
×
317
            _ => 1,
×
318
        }
319
    }
×
320

321
    fn handle_complex_language(
×
322
        _: &mut RuleBreakIterator<Self>,
323
        _: Self::CharType,
324
    ) -> Option<usize> {
325
        unreachable!()
×
326
    }
327
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc