• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 6815798908

09 Nov 2023 05:17PM UTC coverage: 72.607% (-2.4%) from 75.01%
6815798908

push

github

web-flow
Implement `Any/BufferProvider` for some smart pointers (#4255)

Allows storing them as a `Box<dyn Any/BufferProvider>` without using a
wrapper type that implements the trait.

44281 of 60987 relevant lines covered (72.61%)

201375.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

72.17
/provider/datagen/src/transform/segmenter/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
3✔
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
//! This module contains provider implementations backed by built-in segmentation data.
6

7
#![allow(dead_code)]
8
#![allow(unused_imports)]
9

10
use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData};
11
use icu_collections::codepointtrie::CodePointTrie;
12
use icu_properties::{
13
    maps, sets, EastAsianWidth, GeneralCategory, GraphemeClusterBreak, LineBreak, Script,
14
    SentenceBreak, WordBreak,
15
};
16
use icu_provider::datagen::IterableDataProvider;
17
use icu_provider::prelude::*;
18
use icu_segmenter::provider::*;
19
use icu_segmenter::symbols::*;
20
use std::fmt::Debug;
21
use zerovec::ZeroVec;
22

23
pub(crate) mod dictionary;
24
pub(crate) mod lstm;
25

26
// state machine name define by builtin name
27
// [[tables]]
28
// name = "Double_Quote"
29
//
30
// state machine define for combined state
31
// [[tables]]
32
// name = "Double_Quote_ALetter"
33
// left = "Double_Quote"
34
// right = "ALetter"
35
//
36
// state machine define using code point
37
// [[tables]]
38
// name = "ABC"
39
// codepoint = [32, 33, ...]
40
#[derive(serde::Deserialize, Debug)]
1,772✔
41
struct SegmenterProperty {
42
    name: String,
×
43
    // If codepoint is defined, this is custom define, not builtin define.
44
    codepoint: Option<Vec<u32>>,
×
45
    // If left and right are defined, this define is combined state.
46
    left: Option<String>,
×
47
    right: Option<String>,
×
48
    // This combine state is an intermediate match rule.
49
    interm_break_state: Option<bool>,
×
50
}
51

52
// state machine break result define
53
// The follow is "Double_Quote x Double_Quote".
54
// [[rules]]
55
// left = [ "Double_Qoute" ]
56
// right = [ "Double_Qoute" ]
57
// break_state = true # true if break opportunity.
58
#[derive(serde::Deserialize, Debug)]
1,028✔
59
struct SegmenterState {
60
    left: Vec<String>,
×
61
    right: Vec<String>,
×
62
    break_state: Option<bool>,
×
63
}
64

65
// rule based segmenter define
66
//
67
// segmenter_type: builtin type. word, sentence or grapheme.
68
// tables: state machine name defines.
69
// rules: state machine rules.
70
//
71
// segmenter_type = "word"
72
// [[tables]]
73
// ...
74
// [[rules]]
75
// ...
76
#[derive(serde::Deserialize, Debug)]
33✔
77
struct SegmenterRuleTable {
78
    segmenter_type: String,
×
79
    tables: Vec<SegmenterProperty>,
×
80
    rules: Vec<SegmenterState>,
×
81
}
82

83
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
impl crate::DatagenProvider {
    /// Builds the rule-based break data for one segmenter from its TOML rule file.
    ///
    /// `rules` is the text of a rule file that deserializes into a
    /// `SegmenterRuleTable`. The result contains:
    ///
    /// * a code point trie mapping each code point to the index of its property,
    /// * a square break-state table indexed by
    ///   `left_index * property_count + right_index`,
    /// * a rule status table (only filled for the word segmenter),
    /// * the indices of the special `sot`, `eot` and `SA` properties.
    ///
    /// # Panics
    ///
    /// Panics if the TOML cannot be parsed, if any Unicode property data fails to
    /// load, if the segmenter type or a built-in property name is unknown, or if a
    /// rule or combined state references a property that was never declared.
    fn generate_rule_break_data(&self, rules: &str) -> RuleBreakDataV1<'static> {
        let segmenter: SegmenterRuleTable =
            toml::from_str(rules).expect("The data should be valid!");

        let data = maps::load_word_break(self).expect("The data should be valid!");
        let wb = data.as_borrowed();

        let data = maps::load_grapheme_cluster_break(self).expect("The data should be valid!");
        let gb = data.as_borrowed();

        let data = maps::load_sentence_break(self).expect("The data should be valid!");
        let sb = data.as_borrowed();

        let data = maps::load_line_break(self).expect("The data should be valid!");
        let lb = data.as_borrowed();

        let data = maps::load_east_asian_width(self).expect("The data should be valid!");
        let eaw = data.as_borrowed();

        let data = maps::load_general_category(self).expect("The data should be valid!");
        let gc = data.as_borrowed();

        let data = maps::load_script(self).expect("The data should be valid");
        let script = data.as_borrowed();

        let data = sets::load_extended_pictographic(self).expect("The data should be valid!");
        let extended_pictographic = data.as_borrowed();

        /// Stores `break_state` for the (left, right) pair, but only if that entry is
        /// still `UNKNOWN_RULE` or `NOT_MATCH_RULE`; an entry already holding any
        /// other value is left untouched.
        fn set_break_state(
            break_state_table: &mut [i8],
            property_length: usize,
            left_index: usize,
            right_index: usize,
            break_state: i8,
        ) {
            let index = left_index * property_length + right_index;
            if break_state_table[index] == UNKNOWN_RULE
                || break_state_table[index] == NOT_MATCH_RULE
            {
                break_state_table[index] = break_state;
            }
        }

        /// Returns the index of the property called `s`, if it has been registered.
        fn get_index_from_name(properties_names: &[String], s: &str) -> Option<usize> {
            properties_names.iter().position(|n| n.eq(s))
        }

        /// Maps a rule-file property name to its `WordBreak` value.
        /// Panics on an unrecognized name.
        fn get_word_segmenter_value_from_name(name: &str) -> WordBreak {
            match name {
                "ALetter" => WordBreak::ALetter,
                "CR" => WordBreak::CR,
                "Double_Quote" => WordBreak::DoubleQuote,
                "Extend" => WordBreak::Extend,
                "ExtendNumLet" => WordBreak::ExtendNumLet,
                "Format" => WordBreak::Format,
                "Katakana" => WordBreak::Katakana,
                "Hebrew_Letter" => WordBreak::HebrewLetter,
                "LF" => WordBreak::LF,
                "MidLetter" => WordBreak::MidLetter,
                "MidNum" => WordBreak::MidNum,
                "MidNumLet" => WordBreak::MidNumLet,
                "Newline" => WordBreak::Newline,
                "Numeric" => WordBreak::Numeric,
                "Regional_Indicator" => WordBreak::RegionalIndicator,
                "Single_Quote" => WordBreak::SingleQuote,
                "WSegSpace" => WordBreak::WSegSpace,
                "ZWJ" => WordBreak::ZWJ,
                _ => {
                    panic!("Invalid property name: {name}")
                }
            }
        }

        /// Maps a rule-file property name to its `GraphemeClusterBreak` value.
        /// Panics on an unrecognized name.
        fn get_grapheme_segmenter_value_from_name(name: &str) -> GraphemeClusterBreak {
            match name {
                "Control" => GraphemeClusterBreak::Control,
                "CR" => GraphemeClusterBreak::CR,
                "Extend" => GraphemeClusterBreak::Extend,
                "L" => GraphemeClusterBreak::L,
                "LF" => GraphemeClusterBreak::LF,
                "LV" => GraphemeClusterBreak::LV,
                "LVT" => GraphemeClusterBreak::LVT,
                "Prepend" => GraphemeClusterBreak::Prepend,
                "Regional_Indicator" => GraphemeClusterBreak::RegionalIndicator,
                "SpacingMark" => GraphemeClusterBreak::SpacingMark,
                "T" => GraphemeClusterBreak::T,
                "V" => GraphemeClusterBreak::V,
                "ZWJ" => GraphemeClusterBreak::ZWJ,
                _ => {
                    panic!("Invalid property name: {name}")
                }
            }
        }

        /// Maps a rule-file property name to its `SentenceBreak` value.
        /// Panics on an unrecognized name.
        fn get_sentence_segmenter_value_from_name(name: &str) -> SentenceBreak {
            match name {
                "ATerm" => SentenceBreak::ATerm,
                "Close" => SentenceBreak::Close,
                "CR" => SentenceBreak::CR,
                "Extend" => SentenceBreak::Extend,
                "Format" => SentenceBreak::Format,
                "LF" => SentenceBreak::LF,
                "Lower" => SentenceBreak::Lower,
                "Numeric" => SentenceBreak::Numeric,
                "OLetter" => SentenceBreak::OLetter,
                "SContinue" => SentenceBreak::SContinue,
                "Sep" => SentenceBreak::Sep,
                "Sp" => SentenceBreak::Sp,
                "STerm" => SentenceBreak::STerm,
                "Upper" => SentenceBreak::Upper,
                _ => {
                    panic!("Invalid property name: {name}")
                }
            }
        }

        /// Maps a rule-file property name (the two/three-letter line break class
        /// abbreviation) to its `LineBreak` value. Panics on an unrecognized name.
        fn get_line_segmenter_value_from_name(name: &str) -> LineBreak {
            match name {
                "AI" => LineBreak::Ambiguous,
                "AL" => LineBreak::Alphabetic,
                "B2" => LineBreak::BreakBoth,
                "BA" => LineBreak::BreakAfter,
                "BB" => LineBreak::BreakBefore,
                "BK" => LineBreak::MandatoryBreak,
                "CB" => LineBreak::ContingentBreak,
                "CJ" => LineBreak::ConditionalJapaneseStarter,
                "CL" => LineBreak::ClosePunctuation,
                "CM" => LineBreak::CombiningMark,
                "CP" => LineBreak::CloseParenthesis,
                "CR" => LineBreak::CarriageReturn,
                "EB" => LineBreak::EBase,
                "EM" => LineBreak::EModifier,
                "EX" => LineBreak::Exclamation,
                "GL" => LineBreak::Glue,
                "H2" => LineBreak::H2,
                "H3" => LineBreak::H3,
                "HL" => LineBreak::HebrewLetter,
                "HY" => LineBreak::Hyphen,
                "ID" => LineBreak::Ideographic,
                "IN" => LineBreak::Inseparable,
                "IS" => LineBreak::InfixNumeric,
                "JL" => LineBreak::JL,
                "JT" => LineBreak::JT,
                "JV" => LineBreak::JV,
                "LF" => LineBreak::LineFeed,
                "NL" => LineBreak::NextLine,
                "NS" => LineBreak::Nonstarter,
                "NU" => LineBreak::Numeric,
                "OP" => LineBreak::OpenPunctuation,
                "PO" => LineBreak::PostfixNumeric,
                "PR" => LineBreak::PrefixNumeric,
                "QU" => LineBreak::Quotation,
                "RI" => LineBreak::RegionalIndicator,
                "SA" => LineBreak::ComplexContext,
                "SG" => LineBreak::Surrogate,
                "SP" => LineBreak::Space,
                "SY" => LineBreak::BreakSymbols,
                "WJ" => LineBreak::WordJoiner,
                "XX" => LineBreak::Unknown,
                "ZW" => LineBreak::ZWSpace,
                "ZWJ" => LineBreak::ZWJ,
                _ => {
                    panic!("Invalid property name: {name}")
                }
            }
        }

        /// True if the code point's East Asian Width is Ambiguous, Fullwidth or Wide.
        fn is_cjk_fullwidth(
            eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>,
            codepoint: u32,
        ) -> bool {
            matches!(
                eaw.get32(codepoint),
                EastAsianWidth::Ambiguous | EastAsianWidth::Fullwidth | EastAsianWidth::Wide
            )
        }

        /// True if the code point's East Asian Width is Fullwidth, Halfwidth or Wide.
        /// This is the test that separates the `OP_EA`/`CP_EA` properties from
        /// `OP_OP30`.
        fn is_east_asian(
            eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>,
            codepoint: u32,
        ) -> bool {
            matches!(
                eaw.get32(codepoint),
                EastAsianWidth::Fullwidth | EastAsianWidth::Halfwidth | EastAsianWidth::Wide
            )
        }

        // As of Unicode 14.0.0, the break property and the largest codepoint defined in UCD are
        // summarized in the following list. See details in the property txt in
        // https://www.unicode.org/Public/14.0.0/ucd/
        //
        // Line Break Property: U+E01EF ; CM [1]
        // Grapheme Break Property: U+E0FFF ; Control
        // Sentence Break Property: U+E01EF ; Extend
        // Word Break Property: U+E01EF ; Extend
        //
        // The table length should be large enough to contain all codepoints.
        //
        // [1] In LineBreak.txt, it defines F0000..FFFFD and 100000..10FFFD to be "XX", which are
        // the default unassigned values, so it's ok to omit them in the table.
        const CODEPOINT_TABLE_LEN: usize = 0xE1000;

        // properties_map[code point] = index of the property in `properties_names`.
        let mut properties_map = vec![0; CODEPOINT_TABLE_LEN];
        let mut properties_names = Vec::<String>::new();
        // Number of properties that are assigned directly to code points (as opposed
        // to combined states).
        let mut simple_properties_count = 0;
        let mut rule_status_table = Vec::<u8>::new();

        // Index 0 is reserved for "Unknown", the default value of `properties_map`.
        properties_names.push("Unknown".to_string());
        simple_properties_count += 1;

        for p in &segmenter.tables {
            // Register each name once; repeated names are skipped entirely here.
            let property_index = if !properties_names.contains(&p.name) {
                properties_names.push(p.name.clone());
                (properties_names.len() - 1) as u8
            } else {
                continue;
            };

            if p.left.is_none() && p.right.is_none() && p.codepoint.is_none() {
                // If any values aren't set, this is builtin type.
                simple_properties_count += 1;

                match &*segmenter.segmenter_type {
                    "word" => {
                        // Extended_Pictographic isn't a part of word break property
                        // Extended pictographic property is within 0..U+0x20000
                        if p.name == "Extended_Pictographic" {
                            for i in 0..0x20000 {
                                if let Some(c) = char::from_u32(i) {
                                    if extended_pictographic.contains(c) {
                                        properties_map[c as usize] = property_index
                                    }
                                }
                            }
                            continue;
                        }

                        if p.name == "SA" {
                            // Word break property doesn't define SA, but we will use non-UAX29 rules.
                            // SA/CJ property is within 0..U+0x40000
                            for c in 0..0x40000 {
                                if lb.get32(c) == LineBreak::ComplexContext {
                                    properties_map[c as usize] = property_index
                                } else if let Some(c) = char::from_u32(c) {
                                    match script.get(c) {
                                        Script::Han | Script::Hiragana => {
                                            properties_map[c as usize] = property_index;
                                        }
                                        _ => {}
                                    }
                                }
                            }

                            continue;
                        }

                        // TODO(#2239):
                        // How to handle Katakana in UAX29? UAX29 defines Katakana rule, but CJ dictionary has another rules.
                        // Katakana will use UAX#29 rules instead of dictionary.

                        let prop = get_word_segmenter_value_from_name(&p.name);
                        for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                            if wb.get32(c) == prop {
                                properties_map[c as usize] = property_index;
                            }
                        }
                        continue;
                    }

                    "grapheme" => {
                        // Extended_Pictographic isn't a part of grapheme break property
                        // Extended pictographic property is within 0..U+0x20000
                        if p.name == "Extended_Pictographic" {
                            for i in 0..0x20000 {
                                if let Some(c) = char::from_u32(i) {
                                    if extended_pictographic.contains(c) {
                                        properties_map[c as usize] = property_index
                                    }
                                }
                            }
                            continue;
                        }

                        let prop = get_grapheme_segmenter_value_from_name(&p.name);
                        for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                            if gb.get32(c) == prop {
                                properties_map[c as usize] = property_index;
                            }
                        }
                        continue;
                    }

                    "sentence" => {
                        let prop = get_sentence_segmenter_value_from_name(&p.name);
                        for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                            if sb.get32(c) == prop {
                                properties_map[c as usize] = property_index;
                            }
                        }
                        continue;
                    }

                    "line" => {
                        // These names are not plain line break classes: each one is a
                        // line break class narrowed by a second property.
                        if p.name == "CP_EA"
                            || p.name == "OP_OP30"
                            || p.name == "OP_EA"
                            || p.name == "ID_CN"
                            || p.name == "PO_EAW"
                            || p.name == "PR_EAW"
                        {
                            for i in 0..(CODEPOINT_TABLE_LEN as u32) {
                                match lb.get32(i) {
                                    LineBreak::OpenPunctuation => {
                                        // OP_OP30: OP that is not East Asian;
                                        // OP_EA: OP that is East Asian.
                                        if (p.name == "OP_OP30" && !is_east_asian(eaw, i))
                                            || (p.name == "OP_EA" && is_east_asian(eaw, i))
                                        {
                                            properties_map[i as usize] = property_index;
                                        }
                                    }

                                    LineBreak::CloseParenthesis => {
                                        // CP_EA is unused on the latest spec.
                                        if p.name == "CP_EA" && is_east_asian(eaw, i) {
                                            properties_map[i as usize] = property_index;
                                        }
                                    }

                                    LineBreak::Ideographic => {
                                        // ID_CN: unassigned ideographic code points
                                        // that are Extended_Pictographic.
                                        if p.name == "ID_CN"
                                            && gc.get32(i) == GeneralCategory::Unassigned
                                        {
                                            if let Some(c) = char::from_u32(i) {
                                                if extended_pictographic.contains(c) {
                                                    properties_map[i as usize] = property_index;
                                                }
                                            }
                                        }
                                    }

                                    LineBreak::PostfixNumeric => {
                                        if p.name == "PO_EAW" && is_cjk_fullwidth(eaw, i) {
                                            properties_map[i as usize] = property_index;
                                        }
                                    }

                                    LineBreak::PrefixNumeric => {
                                        if p.name == "PR_EAW" && is_cjk_fullwidth(eaw, i) {
                                            properties_map[i as usize] = property_index;
                                        }
                                    }

                                    _ => {}
                                }
                            }
                            continue;
                        }

                        let prop = get_line_segmenter_value_from_name(&p.name);
                        for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                            if lb.get32(c) == prop {
                                properties_map[c as usize] = property_index;
                            }
                        }
                        continue;
                    }

                    other => {
                        panic!("unknown built-in segmenter type: {other}");
                    }
                }
            }

            if let Some(codepoint) = &p.codepoint {
                simple_properties_count += 1;
                for c in codepoint {
                    let c = *c as usize;
                    // Valid indices are 0..CODEPOINT_TABLE_LEN, so the table length
                    // itself must be skipped as well (`>=`, not `>`), otherwise the
                    // indexing below panics.
                    if c >= CODEPOINT_TABLE_LEN {
                        continue;
                    }
                    properties_map[c] = property_index;
                }
            }
        }

        // sot and eot
        properties_names.push("sot".to_string());
        properties_names.push("eot".to_string());

        let rule_size = properties_names.len() * properties_names.len();
        let mut break_state_table = vec![UNKNOWN_RULE; rule_size];

        for rule in &segmenter.rules {
            let break_state = match rule.break_state {
                Some(true) => BREAK_RULE,
                Some(false) => KEEP_RULE,
                None => NOT_MATCH_RULE,
            };

            for l in &rule.left {
                if l == "Any" {
                    // Special case: left is Any
                    for r in &rule.right {
                        if r == "Any" {
                            // Fill all unknown state.
                            for item in break_state_table.iter_mut() {
                                if *item == UNKNOWN_RULE {
                                    *item = break_state;
                                }
                            }
                        } else {
                            let right_index = get_index_from_name(&properties_names, r)
                                .expect("rule references a property not declared in tables");
                            for i in 0..simple_properties_count {
                                set_break_state(
                                    &mut break_state_table,
                                    properties_names.len(),
                                    i,
                                    right_index,
                                    break_state,
                                );
                            }
                        }
                    }
                    continue;
                }
                let left_index = get_index_from_name(&properties_names, l)
                    .expect("rule references a property not declared in tables");
                for r in &rule.right {
                    // Special case: right is Any
                    if r == "Any" {
                        for i in 0..properties_names.len() {
                            set_break_state(
                                &mut break_state_table,
                                properties_names.len(),
                                left_index,
                                i,
                                break_state,
                            );
                        }
                        continue;
                    }
                    let right_index = get_index_from_name(&properties_names, r)
                        .expect("rule references a property not declared in tables");
                    if r != "eot"
                        && break_state_table[left_index * properties_names.len() + right_index]
                            == NOT_MATCH_RULE
                    {
                        break_state_table[left_index * properties_names.len() + right_index] =
                            UNKNOWN_RULE;
                    }
                    set_break_state(
                        &mut break_state_table,
                        properties_names.len(),
                        left_index,
                        right_index,
                        break_state,
                    );
                    // Fill not match for combine state
                    if left_index >= simple_properties_count {
                        for i in 0..properties_names.len() {
                            set_break_state(
                                &mut break_state_table,
                                properties_names.len(),
                                left_index,
                                i,
                                NOT_MATCH_RULE,
                            );
                        }
                    }
                }
            }
        }

        let property_length = properties_names.len();

        // State machine alias
        // For every combined state, the (left, right) entry is overwritten with the
        // index of the combined property instead of a break result.
        for p in &segmenter.tables {
            if let Some(left) = &p.left {
                if let Some(right) = &p.right {
                    let right_index = get_index_from_name(&properties_names, right)
                        .expect("combined state references a property not declared in tables");
                    let left_index = get_index_from_name(&properties_names, left)
                        .expect("combined state references a property not declared in tables");
                    // Only an explicit `interm_break_state = true` marks an
                    // intermediate match; `false` behaves like an absent key.
                    let interm_break_state = if p.interm_break_state == Some(true) {
                        INTERMEDIATE_MATCH_RULE
                    } else {
                        0
                    };

                    let index = properties_names
                        .iter()
                        .position(|n| n.eq(&p.name))
                        .expect("every table name was registered in the first pass")
                        as i8;
                    break_state_table[left_index * property_length + right_index] =
                        index | interm_break_state;
                }
            }
        }

        // Return 127 if the complex language isn't handled.
        let complex_property = get_index_from_name(&properties_names, "SA").unwrap_or(127);

        // Generate a CodePointTrie from properties_map
        let property_trie: CodePointTrie<u8> = CodePointTrieBuilder {
            data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
            default_value: 0,
            error_value: 0,
            trie_type: match self.trie_type() {
                crate::TrieType::Fast => icu_collections::codepointtrie::TrieType::Fast,
                crate::TrieType::Small => icu_collections::codepointtrie::TrieType::Small,
            },
        }
        .build();

        if segmenter.segmenter_type == "line" {
            // Note: The following match statement had been used in line.rs:
            //
            // match codepoint {
            //     0x20000..=0x2fffd => ID,
            //     0x30000..=0x3fffd => ID,
            //     0xe0001 => CM,
            //     0xe0020..=0xe007f => CM,
            //     0xe0100..=0xe01ef => CM,
            //     _ => XX,
            // }
            debug_assert_eq!(property_trie.get32(0x20000), ID);
            debug_assert_eq!(property_trie.get32(0x3fffd), ID);
            debug_assert_eq!(property_trie.get32(0xd0000), XX);
            debug_assert_eq!(property_trie.get32(0xe0001), CM);
            debug_assert_eq!(property_trie.get32(0xe0020), CM);
        }

        // rule status for word segmenter
        // One status byte is pushed per `[[tables]]` entry, in declaration order.
        if segmenter.segmenter_type == "word" {
            for p in &segmenter.tables {
                let rule_state = match &*p.name {
                    "Numeric" => RuleStatusType::Number,
                    "ALetter" => RuleStatusType::Letter,
                    "Hebrew_Letter" => RuleStatusType::Letter,
                    "ExtendNumLet" => RuleStatusType::Letter,
                    "Katakana" => RuleStatusType::Letter,
                    "SA" => RuleStatusType::Letter,
                    _ => RuleStatusType::None,
                };
                rule_status_table.push(rule_state as u8);
            }
        }

        RuleBreakDataV1 {
            property_table: RuleBreakPropertyTable(property_trie),
            break_state_table: RuleBreakStateTable(ZeroVec::new_owned(break_state_table)),
            rule_status_table: RuleStatusTable(ZeroVec::new_owned(rule_status_table)),
            property_count: property_length as u8,
            last_codepoint_property: (simple_properties_count - 1) as i8,
            // `sot` and `eot` were pushed last, in that order.
            sot_property: (property_length - 2) as u8,
            eot_property: (property_length - 1) as u8,
            complex_property: complex_property as u8,
        }
    }
}
641

642
// Implements `DataProvider` and `IterableDataProvider` for `$marker` on
// `DatagenProvider`, generating the payload from the rule file `rules/$rules`.
macro_rules! implement {
    ($marker:ident, $rules:literal) => {
        impl DataProvider<$marker> for crate::DatagenProvider {
            fn load(&self, req: DataRequest) -> Result<DataResponse<$marker>, DataError> {
                // Without either feature the rule data cannot be generated, so every
                // request fails with a custom error.
                #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
                return Err(DataError::custom(
                    "icu_datagen must be built with use_icu4c or use_wasm to build segmentation rules",
                )
                .with_req($marker::KEY, req));
                #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
                self.check_req::<$marker>(req)?;
                // The rule file is embedded at compile time.
                #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
                return Ok(DataResponse {
                    metadata: Default::default(),
                    payload: Some(DataPayload::from_owned(
                        self.generate_rule_break_data(include_str!(concat!("rules/", $rules))),
                    )),
                });
            }
        }

        impl IterableDataProvider<$marker> for crate::DatagenProvider {
            // Only the default locale is reported.
            fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
                Ok(vec![DataLocale::default()])
            }
        }
    }
}
670

671
implement!(LineBreakDataV1Marker, "line.toml");
672
implement!(GraphemeClusterBreakDataV1Marker, "grapheme.toml");
673
implement!(WordBreakDataV1Marker, "word.toml");
674
implement!(SentenceBreakDataV1Marker, "sentence.toml");
675

676
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the grapheme cluster break data through the testing provider and
    /// checks that `complex_property` is 127.
    #[test]
    fn load_grapheme_cluster_data() {
        let provider = crate::DatagenProvider::new_testing();
        let response: DataResponse<GraphemeClusterBreakDataV1Marker> = provider
            .load(Default::default())
            .expect("Loading should succeed!");
        let payload = response.take_payload().expect("Data should be present!");
        let data: &RuleBreakDataV1 = payload.get();
        assert_eq!(
            data.complex_property, 127,
            "Grapheme cluster data doesn't handle SA"
        );
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc