• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9357137046

03 Jun 2024 08:51PM UTC coverage: 75.121% (-1.1%) from 76.254%
9357137046

push

github

web-flow
Switch locid Value to use Subtag (#4941)

This is part of #1833 switching Value API to use Subtag.

61 of 71 new or added lines in 11 files covered. (85.92%)

3224 existing lines in 178 files now uncovered.

52958 of 70497 relevant lines covered (75.12%)

572757.08 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.8
/components/segmenter/src/complex/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use alloc::vec::Vec;
7
use icu_provider::prelude::*;
8

9
mod dictionary;
10
use dictionary::*;
11
mod language;
12
use language::*;
13
#[cfg(feature = "lstm")]
14
mod lstm;
15
#[cfg(feature = "lstm")]
16
use lstm::*;
17

18
#[cfg(not(feature = "lstm"))]
19
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1Marker>, core::convert::Infallible>;
20
#[cfg(not(feature = "lstm"))]
21
type DictOrLstmBorrowed<'a> =
22
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a core::convert::Infallible>;
23

24
#[cfg(feature = "lstm")]
25
type DictOrLstm =
26
    Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataPayload<LstmForWordLineAutoV1Marker>>;
27
#[cfg(feature = "lstm")]
28
type DictOrLstmBorrowed<'a> = Result<
29
    &'a DataPayload<UCharDictionaryBreakDataV1Marker>,
30
    &'a DataPayload<LstmForWordLineAutoV1Marker>,
31
>;
32

UNCOV
33
#[derive(Debug)]
×
34
pub(crate) struct ComplexPayloads {
UNCOV
35
    grapheme: DataPayload<GraphemeClusterBreakDataV1Marker>,
×
36
    my: Option<DictOrLstm>,
×
37
    km: Option<DictOrLstm>,
×
38
    lo: Option<DictOrLstm>,
×
39
    th: Option<DictOrLstm>,
×
40
    ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>>,
×
41
}
42

43
impl ComplexPayloads {
44
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
78✔
45
        const ERR: DataError = DataError::custom("No segmentation model for language");
46
        match language {
78✔
47
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
5✔
UNCOV
48
                ERR.with_display_context("my");
×
49
                None
×
50
            }),
×
51
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
6✔
UNCOV
52
                ERR.with_display_context("km");
×
53
                None
×
54
            }),
×
55
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
6✔
UNCOV
56
                ERR.with_display_context("lo");
×
57
                None
×
58
            }),
×
59
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
49✔
UNCOV
60
                ERR.with_display_context("th");
×
61
                None
×
62
            }),
×
63
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
13✔
64
                ERR.with_display_context("ja");
1✔
65
                None
1✔
66
            }),
1✔
UNCOV
67
            Language::Unknown => None,
×
68
        }
69
    }
78✔
70

71
    /// Builds the payloads from compiled data, using LSTM models for Burmese,
    /// Khmer, Lao and Thai. No Chinese/Japanese dictionary is loaded.
    ///
    /// # Panics
    ///
    /// Panics if `try_load` returns an error for any of the models.
    #[cfg(feature = "lstm")]
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_lstm() -> Self {
        // Loads one LSTM model from the baked provider and stores it on the
        // `Err` side of `DictOrLstm`.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_lstm(model: &'static str) -> Option<DictOrLstm> {
            try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Err)
        }

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: baked_lstm("Burmese_codepoints_exclusive_model4_heavy"),
            km: baked_lstm("Khmer_codepoints_exclusive_model4_heavy"),
            lo: baked_lstm("Lao_codepoints_exclusive_model4_heavy"),
            th: baked_lstm("Thai_codepoints_exclusive_model4_heavy"),
            ja: None,
        }
    }
15✔
111

112
    /// Builds the payloads from `provider`, using LSTM models for Burmese,
    /// Khmer, Lao and Thai. No Chinese/Japanese dictionary is loaded.
    ///
    /// # Errors
    ///
    /// Propagates any error from loading the grapheme data and any error
    /// returned by `try_load` for one of the models.
    #[cfg(feature = "lstm")]
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + ?Sized,
    {
        // Loads happen in the same order as the struct fields are declared.
        let grapheme: DataPayload<GraphemeClusterBreakDataV1Marker> =
            provider.load(Default::default())?.take_payload()?;

        let my: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Burmese_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let km: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Khmer_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let lo: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Lao_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let th: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Thai_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);

        Ok(Self {
            grapheme,
            my,
            km,
            lo,
            th,
            ja: None,
        })
    }
1✔
148

149
    /// Builds the payloads from compiled data, using dictionaries for Burmese,
    /// Khmer, Lao, Thai and Chinese/Japanese.
    ///
    /// # Panics
    ///
    /// Panics if `try_load` returns an error for any of the dictionaries.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_dict() -> Self {
        // Loads one extended dictionary from the baked provider and stores it
        // on the `Ok` side of `DictOrLstm`.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_dict(model: &'static str) -> Option<DictOrLstm> {
            try_load::<DictionaryForWordLineExtendedV1Marker, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Ok)
        }

        // Loads the Chinese/Japanese dictionary from the baked provider.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_cj() -> Option<DataPayload<UCharDictionaryBreakDataV1Marker>> {
            try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, "cjdict")
                .unwrap()
                .map(DataPayload::cast)
        }

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: baked_dict("burmesedict"),
            km: baked_dict("khmerdict"),
            lo: baked_dict("laodict"),
            th: baked_dict("thaidict"),
            ja: baked_cj(),
        }
    }
7✔
190

UNCOV
191
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
×
192
    where
193
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
194
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
195
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
196
            + ?Sized,
197
    {
UNCOV
198
        Ok(Self {
×
UNCOV
199
            grapheme: provider.load(Default::default())?.take_payload()?,
×
UNCOV
200
            my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "burmesedict")?
×
201
                .map(DataPayload::cast)
202
                .map(Ok),
UNCOV
203
            km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "khmerdict")?
×
204
                .map(DataPayload::cast)
205
                .map(Ok),
UNCOV
206
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "laodict")?
×
207
                .map(DataPayload::cast)
208
                .map(Ok),
UNCOV
209
            th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, "thaidict")?
×
210
                .map(DataPayload::cast)
211
                .map(Ok),
UNCOV
212
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, "cjdict")?
×
213
                .map(DataPayload::cast),
UNCOV
214
        })
×
UNCOV
215
    }
×
216

217
    /// Builds the payloads from compiled data, using LSTM models for Burmese,
    /// Khmer, Lao and Thai, and a dictionary for Chinese/Japanese.
    ///
    /// # Panics
    ///
    /// Panics if `try_load` returns an error for any of the models.
    #[cfg(feature = "auto")] // Used by WordSegmenter with "auto" enabled.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_auto() -> Self {
        // Loads one LSTM model from the baked provider and stores it on the
        // `Err` side of `DictOrLstm`.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_lstm(model: &'static str) -> Option<DictOrLstm> {
            try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Err)
        }

        // Loads the Chinese/Japanese dictionary from the baked provider.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_cj() -> Option<DataPayload<UCharDictionaryBreakDataV1Marker>> {
            try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, "cjdict")
                .unwrap()
                .map(DataPayload::cast)
        }

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: baked_lstm("Burmese_codepoints_exclusive_model4_heavy"),
            km: baked_lstm("Khmer_codepoints_exclusive_model4_heavy"),
            lo: baked_lstm("Lao_codepoints_exclusive_model4_heavy"),
            th: baked_lstm("Thai_codepoints_exclusive_model4_heavy"),
            ja: baked_cj(),
        }
    }
4✔
259

260
    /// Builds the payloads from `provider`, using LSTM models for Burmese,
    /// Khmer, Lao and Thai, and a dictionary for Chinese/Japanese.
    ///
    /// # Errors
    ///
    /// Propagates any error from loading the grapheme data and any error
    /// returned by `try_load` for one of the models.
    #[cfg(feature = "auto")] // Used by WordSegmenter with "auto" enabled.
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
            + DataProvider<LstmForWordLineAutoV1Marker>
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
            + ?Sized,
    {
        // Loads happen in the same order as the struct fields are declared.
        let grapheme: DataPayload<GraphemeClusterBreakDataV1Marker> =
            provider.load(Default::default())?.take_payload()?;

        let my: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Burmese_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let km: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Khmer_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let lo: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Lao_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let th: Option<DictOrLstm> = try_load::<LstmForWordLineAutoV1Marker, D>(
            provider,
            "Thai_codepoints_exclusive_model4_heavy",
        )?
        .map(DataPayload::cast)
        .map(Err);
        let ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>> =
            try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, "cjdict")?
                .map(DataPayload::cast);

        Ok(Self {
            grapheme,
            my,
            km,
            lo,
            th,
            ja,
        })
    }
×
298

299
    /// Builds the payloads from compiled data, using dictionaries for Burmese,
    /// Khmer, Lao and Thai only. No Chinese/Japanese dictionary is loaded.
    ///
    /// # Panics
    ///
    /// Panics if `try_load` returns an error for any of the dictionaries.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_southeast_asian() -> Self {
        // Loads one extended dictionary from the baked provider and stores it
        // on the `Ok` side of `DictOrLstm`.
        // try_load is infallible if the provider only returns `MissingLocale`.
        #[allow(clippy::unwrap_used)]
        fn baked_dict(model: &'static str) -> Option<DictOrLstm> {
            try_load::<DictionaryForWordLineExtendedV1Marker, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Ok)
        }

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
            ),
            my: baked_dict("burmesedict"),
            km: baked_dict("khmerdict"),
            lo: baked_dict("laodict"),
            th: baked_dict("thaidict"),
            ja: None,
        }
    }
72✔
338

339
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
1✔
340
    where
341
        D: DataProvider<DictionaryForWordLineExtendedV1Marker>
342
            + DataProvider<GraphemeClusterBreakDataV1Marker>
343
            + ?Sized,
344
    {
345
        Ok(Self {
1✔
346
            grapheme: provider.load(Default::default())?.take_payload()?,
1✔
347
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "burmesedict")?
1✔
348
                .map(DataPayload::cast)
349
                .map(Ok),
350
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "khmerdict")?
1✔
351
                .map(DataPayload::cast)
352
                .map(Ok),
353
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "laodict")?
1✔
354
                .map(DataPayload::cast)
355
                .map(Ok),
356
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, "thaidict")?
1✔
357
                .map(DataPayload::cast)
358
                .map(Ok),
359
            ja: None,
1✔
UNCOV
360
        })
×
361
    }
1✔
362
}
363

364
fn try_load<M: KeyedDataMarker, P: DataProvider<M> + ?Sized>(
562✔
365
    provider: &P,
366
    model: &'static str,
367
) -> Result<Option<DataPayload<M>>, DataError> {
368
    match provider.load(DataRequest {
521✔
369
        key_attributes: &model.parse().unwrap(),
562✔
370
        metadata: {
371
            let mut m = DataRequestMetadata::default();
562✔
372
            m.silent = true;
527✔
373
            m
527✔
374
        },
375
        ..Default::default()
527✔
376
    }) {
377
        Ok(response) => Ok(Some(response.take_payload()?)),
538✔
378
        Err(DataError {
379
            kind: DataErrorKind::MissingLocale,
380
            ..
UNCOV
381
        }) => Ok(None),
×
UNCOV
382
        Err(e) => Err(e),
×
383
    }
384
}
530✔
385

386
/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
387
pub(crate) fn complex_language_segment_utf16(
36✔
388
    payloads: &ComplexPayloads,
389
    input: &[u16],
390
) -> Vec<usize> {
391
    let mut result = Vec::new();
36✔
392
    let mut offset = 0;
36✔
393
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
72✔
394
        match payloads.select(lang) {
36✔
395
            Some(Ok(dict)) => {
12✔
396
                result.extend(
12✔
397
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
12✔
398
                        .segment_utf16(slice)
399
                        .map(|n| offset + n),
53✔
400
                );
401
            }
402
            #[cfg(feature = "lstm")]
403
            Some(Err(lstm)) => {
24✔
404
                result.extend(
24✔
405
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
24✔
406
                        .segment_utf16(slice)
407
                        .map(|n| offset + n),
111✔
408
                );
409
            }
410
            #[cfg(not(feature = "lstm"))]
411
            Some(Err(_infallible)) => {} // should be refutable
412
            None => {
UNCOV
413
                result.push(offset + slice.len());
×
414
            }
415
        }
416
        offset += slice.len();
36✔
417
    }
418
    result
36✔
419
}
36✔
420

421
/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
422
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
42✔
423
    let mut result = Vec::new();
42✔
424
    let mut offset = 0;
42✔
425
    for (slice, lang) in LanguageIterator::new(input) {
84✔
426
        match payloads.select(lang) {
42✔
427
            Some(Ok(dict)) => {
18✔
428
                result.extend(
18✔
429
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
18✔
430
                        .segment_str(slice)
431
                        .map(|n| offset + n),
72✔
432
                );
433
            }
434
            #[cfg(feature = "lstm")]
435
            Some(Err(lstm)) => {
23✔
436
                result.extend(
23✔
437
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
23✔
438
                        .segment_str(slice)
439
                        .map(|n| offset + n),
107✔
440
                );
441
            }
442
            #[cfg(not(feature = "lstm"))]
443
            Some(Err(_infallible)) => {} // should be refutable
444
            None => {
445
                result.push(offset + slice.len());
1✔
446
            }
447
        }
448
        offset += slice.len();
42✔
449
    }
450
    result
42✔
451
}
42✔
452

453
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;

    /// The LSTM and dictionary payloads must yield the same Thai word breaks,
    /// for both the UTF-8 and the UTF-16 entry points.
    #[test]
    fn thai_word_break() {
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        // Expected offsets in UTF-8 bytes and in UTF-16 code units.
        const UTF8_BREAKS: [usize; 4] = [12, 21, 33, 42];
        const UTF16_BREAKS: [usize; 4] = [4, 7, 11, 14];

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        for payloads in [ComplexPayloads::new_lstm(), ComplexPayloads::new_dict()] {
            assert_eq!(
                complex_language_segment_str(&payloads, TEST_STR),
                UTF8_BREAKS
            );
            assert_eq!(
                complex_language_segment_utf16(&payloads, &utf16),
                UTF16_BREAKS
            );
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc