• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 7093430059

04 Dec 2023 01:41PM UTC coverage: 73.022% (-0.001%) from 73.023%
7093430059

push

github

web-flow
Exclude baked data sources from rustdoc (#4396)

The baked source converted to HTML adds up to dozens of MB and has no
value as it's not readable.

45378 of 62143 relevant lines covered (73.02%)

278034.04 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.93
/provider/datagen/src/transform/segmenter/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
×
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
//! This module contains provider implementations backed by built-in segmentation data.
6

7
#![allow(dead_code)]
8
#![allow(unused_imports)]
9

10
use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData};
11
use icu_collections::codepointtrie::CodePointTrie;
12
use icu_properties::{
13
    maps, sets, EastAsianWidth, GeneralCategory, GraphemeClusterBreak, LineBreak, Script,
14
    SentenceBreak, WordBreak,
15
};
16
use icu_provider::datagen::IterableDataProvider;
17
use icu_provider::prelude::*;
18
use icu_segmenter::provider::*;
19
use icu_segmenter::symbols::*;
20
use std::fmt::Debug;
21
use zerovec::ZeroVec;
22

23
pub(crate) mod dictionary;
24
pub(crate) mod lstm;
25

26
// state machine name define by builtin name
27
// [[tables]]
28
// name = "Double_Quote"
29
//
30
// state machine define for combined state
31
// [[tables]]
32
// name = "Double_Quote_ALetter"
33
// left = "Double_Quote"
34
// right = "ALetter"
35
//
36
// state machine define using code point
37
// [[tables]]
38
// name = "ABC"
39
// codepoint = [32, 33, ...]
40
#[derive(serde::Deserialize, Debug)]
2,428✔
41
struct SegmenterProperty {
42
    name: String,
×
43
    // If codepoint is defined, this is custom define, not builtin define.
44
    codepoint: Option<Vec<u32>>,
×
45
    // If left and right are defined, this define is combined state.
46
    left: Option<String>,
×
47
    right: Option<String>,
×
48
    // This combine state is an intermediate match rule.
49
    interm_break_state: Option<bool>,
×
50
}
51

52
// state machine break result define
53
// The follow is "Double_Quote x Double_Quote".
54
// [[rules]]
55
// left = [ "Double_Qoute" ]
56
// right = [ "Double_Qoute" ]
57
// break_state = true # true if break opportunity.
58
#[derive(serde::Deserialize, Debug)]
814✔
59
struct SegmenterState {
60
    left: Vec<String>,
×
61
    right: Vec<String>,
×
62
    break_state: Option<bool>,
×
63
}
64

65
// rule based segmenter define
66
//
67
// segmenter_type: builtin type. word, sentence or grapheme.
68
// tables: state machine name defines.
69
// rules: state machine rules.
70
//
71
// segmenter_type = "word"
72
// [[tables]]
73
// ...
74
// [[rules]]
75
// ...
76
#[derive(serde::Deserialize, Debug)]
22✔
77
struct SegmenterRuleTable {
78
    segmenter_type: String,
×
79
    tables: Vec<SegmenterProperty>,
×
80
    rules: Vec<SegmenterState>,
×
81
}
82

83
/// Builds the rule-based break data for one segmenter from a TOML rules file.
///
/// The rules file is parsed into a `SegmenterRuleTable`. Its `tables` entries assign a
/// property index to every code point below `CODEPOINT_TABLE_LEN`; its `rules` entries
/// fill the `(left, right)` break state table; combined states from `tables` are then
/// written into that table as aliases.
///
/// # Arguments
///
/// * `provider` - source of the rules file (via `icuexport()`) and of the Unicode
///   property data.
/// * `rules_file` - path of the TOML rules file handed to `read_and_parse_toml`.
/// * `trie_type` - selects the trie type of the generated property table.
///
/// # Panics
///
/// Panics if the rules file or any property data cannot be loaded, if a built-in
/// property name is not known to the relevant name-to-enum mapper, if `segmenter_type`
/// is not `"word"`, `"grapheme"`, `"sentence"` or `"line"`, or if a rule or combined
/// state refers to a property name that was never defined.
#[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
fn generate_rule_break_data(
    provider: &crate::DatagenProvider,
    rules_file: &str,
    trie_type: crate::TrieType,
) -> RuleBreakDataV1<'static> {
    let segmenter = provider
        .icuexport()
        .unwrap()
        .read_and_parse_toml::<SegmenterRuleTable>(rules_file)
        .expect("The data should be valid!");

    let data = maps::load_word_break(provider).expect("The data should be valid!");
    let wb = data.as_borrowed();

    let data = maps::load_grapheme_cluster_break(provider).expect("The data should be valid!");
    let gb = data.as_borrowed();

    let data = maps::load_sentence_break(provider).expect("The data should be valid!");
    let sb = data.as_borrowed();

    let data = maps::load_line_break(provider).expect("The data should be valid!");
    let lb = data.as_borrowed();

    let data = maps::load_east_asian_width(provider).expect("The data should be valid!");
    let eaw = data.as_borrowed();

    let data = maps::load_general_category(provider).expect("The data should be valid!");
    let gc = data.as_borrowed();

    let data = maps::load_script(provider).expect("The data should be valid");
    let script = data.as_borrowed();

    let data = sets::load_extended_pictographic(provider).expect("The data should be valid!");
    let extended_pictographic = data.as_borrowed();

    let data =
        GraphemeClusterBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
    let gcb_name_to_enum = data.as_borrowed();

    let data = LineBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
    let lb_name_to_enum = data.as_borrowed();

    let data = SentenceBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
    let sb_name_to_enum = data.as_borrowed();

    let data = WordBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
    let wb_name_to_enum = data.as_borrowed();

    /// Stores `break_state` for the `(left_index, right_index)` pair, but only if that
    /// cell is still `UNKNOWN_RULE` or `NOT_MATCH_RULE`; any other value is kept.
    fn set_break_state(
        break_state_table: &mut [i8],
        property_length: usize,
        left_index: usize,
        right_index: usize,
        break_state: i8,
    ) {
        let index = left_index * property_length + right_index;
        if break_state_table[index] == UNKNOWN_RULE || break_state_table[index] == NOT_MATCH_RULE {
            break_state_table[index] = break_state;
        }
    }

    /// Returns the position of the property called `s`, if it has been defined.
    fn get_index_from_name(properties_names: &[String], s: &str) -> Option<usize> {
        properties_names.iter().position(|n| n.eq(s))
    }

    /// True if the East Asian Width of `codepoint` is Ambiguous, Fullwidth or Wide.
    fn is_cjk_fullwidth(
        eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>,
        codepoint: u32,
    ) -> bool {
        matches!(
            eaw.get32(codepoint),
            EastAsianWidth::Ambiguous | EastAsianWidth::Fullwidth | EastAsianWidth::Wide
        )
    }

    /// True if the East Asian Width of `codepoint` is Fullwidth, Halfwidth or Wide.
    fn is_fullwidth_halfwidth_or_wide(
        eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>,
        codepoint: u32,
    ) -> bool {
        matches!(
            eaw.get32(codepoint),
            EastAsianWidth::Fullwidth | EastAsianWidth::Halfwidth | EastAsianWidth::Wide
        )
    }

    // As of Unicode 14.0.0, the break property and the largest codepoint defined in UCD are
    // summarized in the following list. See details in the property txt in
    // https://www.unicode.org/Public/14.0.0/ucd/
    //
    // Line Break Property: U+E01EF ; CM [1]
    // Grapheme Break Property: U+E0FFF ; Control
    // Sentence Break Property: U+E01EF ; Extend
    // Word Break Property: U+E01EF ; Extend
    //
    // The table length should be large enough to contain all codepoints.
    //
    // [1] In LineBreak.txt, it defines F0000..FFFFD and 100000..10FFFD to be "XX", which are
    // the default unassigned values, so it's ok to omit them in the table.
    const CODEPOINT_TABLE_LEN: usize = 0xE1000;

    // Property index of every code point; index 0 is the "Unknown" property.
    let mut properties_map = vec![0; CODEPOINT_TABLE_LEN];
    let mut properties_names = Vec::<String>::new();
    // Number of properties that map directly to code points (everything that is not a
    // combined state), including "Unknown".
    let mut simple_properties_count = 0;

    properties_names.push("Unknown".to_string());
    simple_properties_count += 1;

    for p in &segmenter.tables {
        // Only the first definition of a name is used.
        if properties_names.contains(&p.name) {
            continue;
        }
        properties_names.push(p.name.clone());
        let property_index = (properties_names.len() - 1) as u8;

        if p.left.is_none() && p.right.is_none() && p.codepoint.is_none() {
            // If any values aren't set, this is builtin type.
            simple_properties_count += 1;

            // Every arm below either moves on to the next table entry or panics.
            match segmenter.segmenter_type.as_str() {
                "word" => {
                    // Extended_Pictographic isn't a part of word break property
                    // Extended pictographic property is within 0..U+0x20000
                    if p.name == "Extended_Pictographic" {
                        for i in 0..0x20000 {
                            if let Some(c) = char::from_u32(i) {
                                if extended_pictographic.contains(c) {
                                    properties_map[c as usize] = property_index
                                }
                            }
                        }
                        continue;
                    }

                    if p.name == "SA" {
                        // Word break property doesn't define SA, but we will use non-UAX29 rules.
                        // SA/CJ property is within 0..U+0x40000
                        for c in 0..0x40000 {
                            if lb.get32(c) == LineBreak::ComplexContext {
                                properties_map[c as usize] = property_index
                            } else if let Some(c) = char::from_u32(c) {
                                match script.get(c) {
                                    Script::Han | Script::Hiragana => {
                                        properties_map[c as usize] = property_index;
                                    }

                                    _ => {}
                                }
                            }
                        }
                        continue;
                    }

                    // TODO(#2239):
                    // How to handle Katakana in UAX29? UAX29 defines Katakana rule, but CJ dictionary has another rules.
                    // Katakana will use UAX#29 rules instead of dictionary.

                    let prop = wb_name_to_enum
                        .get_loose(&p.name)
                        .expect("property name should be valid!");
                    for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                        if wb.get32(c) == prop {
                            properties_map[c as usize] = property_index;
                        }
                    }
                    continue;
                }

                "grapheme" => {
                    // Extended_Pictographic isn't a part of grapheme break property
                    // Extended pictographic property is within 0..U+0x20000
                    if p.name == "Extended_Pictographic" {
                        for i in 0..0x20000 {
                            if let Some(c) = char::from_u32(i) {
                                if extended_pictographic.contains(c) {
                                    properties_map[c as usize] = property_index
                                }
                            }
                        }
                        continue;
                    }

                    let prop = gcb_name_to_enum
                        .get_loose(&p.name)
                        .expect("property name should be valid!");
                    for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                        if gb.get32(c) == prop {
                            properties_map[c as usize] = property_index;
                        }
                    }
                    continue;
                }

                "sentence" => {
                    let prop = sb_name_to_enum
                        .get_loose(&p.name)
                        .expect("property name should be valid!");
                    for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                        if sb.get32(c) == prop {
                            properties_map[c as usize] = property_index;
                        }
                    }
                    continue;
                }

                "line" => {
                    // These names are not line break property values; they refine a line
                    // break class with a second property.
                    if p.name == "CP_EA"
                        || p.name == "OP_OP30"
                        || p.name == "OP_EA"
                        || p.name == "ID_CN"
                        || p.name == "PO_EAW"
                        || p.name == "PR_EAW"
                    {
                        for i in 0..(CODEPOINT_TABLE_LEN as u32) {
                            match lb.get32(i) {
                                LineBreak::OpenPunctuation => {
                                    if (p.name == "OP_OP30"
                                        && !is_fullwidth_halfwidth_or_wide(eaw, i))
                                        || (p.name == "OP_EA"
                                            && is_fullwidth_halfwidth_or_wide(eaw, i))
                                    {
                                        properties_map[i as usize] = property_index;
                                    }
                                }

                                LineBreak::CloseParenthesis => {
                                    // CP_EA is unused on the latest spec.
                                    if p.name == "CP_EA" && is_fullwidth_halfwidth_or_wide(eaw, i) {
                                        properties_map[i as usize] = property_index;
                                    }
                                }

                                LineBreak::Ideographic => {
                                    if p.name == "ID_CN"
                                        && gc.get32(i) == GeneralCategory::Unassigned
                                    {
                                        if let Some(c) = char::from_u32(i) {
                                            if extended_pictographic.contains(c) {
                                                properties_map[i as usize] = property_index;
                                            }
                                        }
                                    }
                                }

                                LineBreak::PostfixNumeric => {
                                    if p.name == "PO_EAW" && is_cjk_fullwidth(eaw, i) {
                                        properties_map[i as usize] = property_index;
                                    }
                                }

                                LineBreak::PrefixNumeric => {
                                    if p.name == "PR_EAW" && is_cjk_fullwidth(eaw, i) {
                                        properties_map[i as usize] = property_index;
                                    }
                                }

                                _ => {}
                            }
                        }
                        continue;
                    }

                    let prop = lb_name_to_enum
                        .get_loose(&p.name)
                        .expect("property name should be valid!");
                    for c in 0..(CODEPOINT_TABLE_LEN as u32) {
                        if lb.get32(c) == prop {
                            properties_map[c as usize] = property_index;
                        }
                    }
                    continue;
                }

                _ => {
                    panic!("unknown built-in segmenter type");
                }
            }
        }

        if let Some(codepoints) = &p.codepoint {
            simple_properties_count += 1;
            for &c in codepoints {
                // `get_mut` skips every code point that does not fit in the table,
                // including `c == CODEPOINT_TABLE_LEN`, instead of indexing out of bounds.
                if let Some(slot) = properties_map.get_mut(c as usize) {
                    *slot = property_index;
                }
            }
        }
    }

    // sot and eot
    properties_names.push("sot".to_string());
    properties_names.push("eot".to_string());

    // No more names are added from here on, so the count is final.
    let property_count = properties_names.len();
    let rule_size = property_count * property_count;
    let mut break_state_table = vec![UNKNOWN_RULE; rule_size];

    for rule in &segmenter.rules {
        let break_state = match rule.break_state {
            Some(true) => BREAK_RULE,
            Some(false) => KEEP_RULE,
            None => NOT_MATCH_RULE,
        };

        for l in &rule.left {
            if l == "Any" {
                // Special case: left is Any
                for r in &rule.right {
                    if r == "Any" {
                        // Fill all unknown state.
                        for item in break_state_table.iter_mut() {
                            if *item == UNKNOWN_RULE {
                                *item = break_state;
                            }
                        }
                    } else {
                        let right_index = get_index_from_name(&properties_names, r).unwrap();
                        for i in 0..simple_properties_count {
                            set_break_state(
                                &mut break_state_table,
                                property_count,
                                i,
                                right_index,
                                break_state,
                            );
                        }
                    }
                }
                continue;
            }
            let left_index = get_index_from_name(&properties_names, l).unwrap();
            for r in &rule.right {
                // Special case: right is Any
                if r == "Any" {
                    for i in 0..property_count {
                        set_break_state(
                            &mut break_state_table,
                            property_count,
                            left_index,
                            i,
                            break_state,
                        );
                    }
                    continue;
                }
                let right_index = get_index_from_name(&properties_names, r).unwrap();
                let cell = left_index * property_count + right_index;
                // Reset a "not match" cell (except towards eot) before applying the
                // rule to it.
                if r != "eot" && break_state_table[cell] == NOT_MATCH_RULE {
                    break_state_table[cell] = UNKNOWN_RULE;
                }
                set_break_state(
                    &mut break_state_table,
                    property_count,
                    left_index,
                    right_index,
                    break_state,
                );
                // Fill not match for combine state
                if left_index >= simple_properties_count {
                    for i in 0..property_count {
                        set_break_state(
                            &mut break_state_table,
                            property_count,
                            left_index,
                            i,
                            NOT_MATCH_RULE,
                        );
                    }
                }
            }
        }
    }

    // State machine alias
    for p in &segmenter.tables {
        if let (Some(left), Some(right)) = (&p.left, &p.right) {
            let right_index = get_index_from_name(&properties_names, right).unwrap();
            let left_index = get_index_from_name(&properties_names, left).unwrap();
            // NOTE(review): the flag is set whenever `interm_break_state` is present,
            // even if its value is `false` - confirm this is intended.
            let interm_break_state = if p.interm_break_state.is_some() {
                INTERMEDIATE_MATCH_RULE
            } else {
                0
            };

            let index = get_index_from_name(&properties_names, &p.name).unwrap() as i8;
            break_state_table[left_index * property_count + right_index] =
                index | interm_break_state;
        }
    }

    // Only the word segmenter carries rule status information.
    let rule_status_table = if segmenter.segmenter_type == "word" {
        segmenter
            .tables
            .iter()
            .map(|p| {
                (match &*p.name {
                    "Numeric" => RuleStatusType::Number,
                    "ALetter" | "Hebrew_Letter" | "ExtendNumLet" | "Katakana" | "SA" => {
                        RuleStatusType::Letter
                    }
                    _ => RuleStatusType::None,
                }) as u8
            })
            .collect()
    } else {
        Default::default()
    };

    RuleBreakDataV1 {
        property_table: RuleBreakPropertyTable(
            CodePointTrieBuilder {
                data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map),
                default_value: 0,
                error_value: 0,
                trie_type: match trie_type {
                    crate::TrieType::Fast => icu_collections::codepointtrie::TrieType::Fast,
                    crate::TrieType::Small => icu_collections::codepointtrie::TrieType::Small,
                },
            }
            .build(),
        ),
        break_state_table: RuleBreakStateTable(ZeroVec::new_owned(break_state_table)),
        rule_status_table: RuleStatusTable(ZeroVec::new_owned(rule_status_table)),
        property_count: property_count as u8,
        last_codepoint_property: (simple_properties_count - 1) as i8,
        // "sot" and "eot" were pushed last, in that order.
        sot_property: (property_count - 2) as u8,
        eot_property: (property_count - 1) as u8,
        // Return 127 if the complex language isn't handled.
        complex_property: get_index_from_name(&properties_names, "SA").unwrap_or(127) as u8,
    }
}
4✔
523

524
/// Implements `DataProvider` and `IterableDataProvider` for `crate::DatagenProvider`
/// for the data marker `$marker`, generating the data from the rules file `$rules`.
///
/// Without the `use_wasm` or `use_icu4c` feature, `load` always returns an error.
macro_rules! implement {
    ($marker:ident, $rules:literal) => {
        impl DataProvider<$marker> for crate::DatagenProvider {
            fn load(&self, req: DataRequest) -> Result<DataResponse<$marker>, DataError> {
                // Exactly one of the two `return` statements below is compiled in.
                #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))]
                return Err(DataError::custom(
                    "icu_datagen must be built with use_icu4c or use_wasm to build segmentation rules",
                )
                .with_req($marker::KEY, req));
                #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))]
                return {
                    self.check_req::<$marker>(req)?;
                    // The rules and properties come from the bundled provider; only the
                    // trie type is taken from `self`.
                    let source = hardcoded_segmenter_provider();
                    let payload = DataPayload::from_owned(generate_rule_break_data(
                        &source,
                        $rules,
                        self.trie_type(),
                    ));

                    Ok(DataResponse {
                        metadata: DataResponseMetadata::default(),
                        payload: Some(payload),
                    })
                };
            }
        }

        impl IterableDataProvider<$marker> for crate::DatagenProvider {
            fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
                // A single entry: the default locale.
                Ok(vec![DataLocale::default()])
            }
        }
    }
}
557

558
fn hardcoded_segmenter_provider() -> crate::DatagenProvider {
4✔
559
    #![allow(deprecated)]
560
    use crate::{
561
        source::{AbstractFs, SerdeCache},
562
        DatagenProvider, TrieType,
563
    };
564
    // Singleton so that all instantiations share the same cache.
565
    static SINGLETON: once_cell::sync::OnceCell<DatagenProvider> = once_cell::sync::OnceCell::new();
566
    SINGLETON
4✔
567
        .get_or_init(|| {
1✔
568
            let mut provider = DatagenProvider::new_custom();
1✔
569
            provider.source.icuexport_paths =
1✔
570
                Some(std::sync::Arc::new(SerdeCache::new(AbstractFs::Memory(
1✔
571
                    [
1✔
572
                        (
1✔
573
                            "uprops/small/ea.toml",
574
                            include_bytes!("../../../data/segmenter/uprops/small/ea.toml")
1✔
575
                                .as_slice(),
576
                        ),
577
                        (
1✔
578
                            "uprops/small/ExtPict.toml",
579
                            include_bytes!("../../../data/segmenter/uprops/small/ExtPict.toml")
1✔
580
                                .as_slice(),
581
                        ),
582
                        (
1✔
583
                            "uprops/small/gc.toml",
584
                            include_bytes!("../../../data/segmenter/uprops/small/gc.toml")
1✔
585
                                .as_slice(),
586
                        ),
587
                        (
1✔
588
                            "uprops/small/GCB.toml",
589
                            include_bytes!("../../../data/segmenter/uprops/small/GCB.toml")
1✔
590
                                .as_slice(),
591
                        ),
592
                        (
1✔
593
                            "uprops/small/lb.toml",
594
                            include_bytes!("../../../data/segmenter/uprops/small/lb.toml")
1✔
595
                                .as_slice(),
596
                        ),
597
                        (
1✔
598
                            "uprops/small/SB.toml",
599
                            include_bytes!("../../../data/segmenter/uprops/small/SB.toml")
1✔
600
                                .as_slice(),
601
                        ),
602
                        (
1✔
603
                            "uprops/small/sc.toml",
604
                            include_bytes!("../../../data/segmenter/uprops/small/sc.toml")
1✔
605
                                .as_slice(),
606
                        ),
607
                        (
1✔
608
                            "uprops/small/WB.toml",
609
                            include_bytes!("../../../data/segmenter/uprops/small/WB.toml")
1✔
610
                                .as_slice(),
611
                        ),
612
                        (
1✔
613
                            "segmenter/grapheme.toml",
614
                            include_bytes!("../../../data/segmenter/grapheme.toml").as_slice(),
1✔
615
                        ),
616
                        (
1✔
617
                            "segmenter/line.toml",
618
                            include_bytes!("../../../data/segmenter/line.toml").as_slice(),
1✔
619
                        ),
620
                        (
1✔
621
                            "segmenter/sentence.toml",
622
                            include_bytes!("../../../data/segmenter/sentence.toml").as_slice(),
1✔
623
                        ),
624
                        (
1✔
625
                            "segmenter/word.toml",
626
                            include_bytes!("../../../data/segmenter/word.toml").as_slice(),
1✔
627
                        ),
628
                    ]
629
                    .into_iter()
630
                    .collect(),
631
                ))));
632
            provider
633
        })
1✔
634
        .clone()
635
}
4✔
636

637
// Generate the provider impls for each rule-based segmenter data marker, pairing the
// marker with the rules file it is built from.
implement!(LineBreakDataV1Marker, "segmenter/line.toml");
638
implement!(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme.toml");
639
implement!(WordBreakDataV1Marker, "segmenter/word.toml");
640
implement!(SentenceBreakDataV1Marker, "segmenter/sentence.toml");
641

642
#[cfg(test)]
mod tests {
    use super::*;

    /// The grapheme cluster data must load and report 127 as its complex property.
    #[test]
    fn load_grapheme_cluster_data() {
        let provider = crate::DatagenProvider::new_testing();
        let response: DataResponse<GraphemeClusterBreakDataV1Marker> = provider
            .load(Default::default())
            .expect("Loading should succeed!");
        let payload = response.take_payload().expect("Data should be present!");
        assert_eq!(
            payload.get().complex_property,
            127,
            "Grapheme cluster data doesn't handle SA"
        );
    }

    /// The line break data must load and map sample code points to the expected
    /// properties.
    #[test]
    fn load_line_data() {
        let provider = crate::DatagenProvider::new_testing();
        let response: DataResponse<LineBreakDataV1Marker> = provider
            .load(Default::default())
            .expect("Loading should succeed!");
        let payload = response.take_payload().expect("Data should be present!");
        let table = &payload.get().property_table.0;
        // Note: The following match statement had been used in line.rs:
        //
        // match codepoint {
        //     0x20000..=0x2fffd => ID,
        //     0x30000..=0x3fffd => ID,
        //     0xe0001 => CM,
        //     0xe0020..=0xe007f => CM,
        //     0xe0100..=0xe01ef => CM,
        //     _ => XX,
        // }
        assert_eq!(table.get32(0x20000), ID);
        assert_eq!(table.get32(0x3fffd), ID);
        assert_eq!(table.get32(0xd0000), XX);
        assert_eq!(table.get32(0xe0001), CM);
        assert_eq!(table.get32(0xe0020), CM);
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc