• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 11904027177

19 Nov 2024 12:33AM UTC coverage: 75.477% (+0.3%) from 75.174%
11904027177

push

github

web-flow
Move DateTimePattern into pattern module (#5834)

#1317

Also removes `NeoNeverMarker` and fixes #5689

258 of 319 new or added lines in 6 files covered. (80.88%)

6967 existing lines in 278 files now uncovered.

54522 of 72237 relevant lines covered (75.48%)

655305.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.74
/components/segmenter/src/complex/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use alloc::vec::Vec;
7
use icu_provider::prelude::*;
8

9
mod dictionary;
10
use dictionary::*;
11
mod language;
12
use language::*;
13
#[cfg(feature = "lstm")]
14
mod lstm;
15
#[cfg(feature = "lstm")]
16
use lstm::*;
17

18
#[cfg(not(feature = "lstm"))]
19
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1Marker>, core::convert::Infallible>;
20
#[cfg(not(feature = "lstm"))]
21
type DictOrLstmBorrowed<'a> =
22
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a core::convert::Infallible>;
23

24
#[cfg(feature = "lstm")]
25
type DictOrLstm =
26
    Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataPayload<LstmForWordLineAutoV1Marker>>;
27
#[cfg(feature = "lstm")]
28
type DictOrLstmBorrowed<'a> = Result<
29
    &'a DataPayload<UCharDictionaryBreakDataV1Marker>,
30
    &'a DataPayload<LstmForWordLineAutoV1Marker>,
31
>;
32

33
#[derive(Debug)]
×
34
pub(crate) struct ComplexPayloads {
35
    grapheme: DataPayload<GraphemeClusterBreakDataV2Marker>,
×
36
    my: Option<DictOrLstm>,
×
37
    km: Option<DictOrLstm>,
×
38
    lo: Option<DictOrLstm>,
×
39
    th: Option<DictOrLstm>,
×
40
    ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>>,
×
41
}
42

43
#[cfg(feature = "lstm")]
44
const MY_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Burmese_");
45
#[cfg(feature = "lstm")]
46
const KM_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Khmer_");
47
#[cfg(feature = "lstm")]
48
const LO_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Lao_");
49
#[cfg(feature = "lstm")]
50
const TH_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Thai_");
51

52
const MY_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("burmesedict");
53
const KM_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("khmerdict");
54
const LO_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("laodict");
55
const TH_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("thaidict");
56
const CJ_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("cjdict");
57

58
impl ComplexPayloads {
59
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
78✔
60
        const ERR: DataError = DataError::custom("No segmentation model for language");
61
        match language {
78✔
62
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
5✔
UNCOV
63
                ERR.with_display_context("my");
×
UNCOV
64
                None
×
UNCOV
65
            }),
×
66
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
6✔
67
                ERR.with_display_context("km");
×
UNCOV
68
                None
×
UNCOV
69
            }),
×
70
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
6✔
UNCOV
71
                ERR.with_display_context("lo");
×
UNCOV
72
                None
×
UNCOV
73
            }),
×
74
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
49✔
UNCOV
75
                ERR.with_display_context("th");
×
UNCOV
76
                None
×
UNCOV
77
            }),
×
78
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
13✔
79
                ERR.with_display_context("ja");
1✔
80
                None
1✔
81
            }),
1✔
UNCOV
82
            Language::Unknown => None,
×
83
        }
84
    }
78✔
85

86
    #[cfg(feature = "lstm")]
87
    #[cfg(feature = "compiled_data")]
88
    pub(crate) fn new_lstm() -> Self {
15✔
89
        #[allow(clippy::unwrap_used)]
90
        // try_load is infallible if the provider only returns `MissingLocale`.
91
        Self {
15✔
92
            grapheme: DataPayload::from_static_ref(
15✔
93
                crate::provider::Baked::SINGLETON_GRAPHEME_CLUSTER_BREAK_DATA_V2_MARKER,
94
            ),
95
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, MY_LSTM)
15✔
96
                .unwrap()
97
                .map(DataPayload::cast)
98
                .map(Err),
99
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, KM_LSTM)
15✔
100
                .unwrap()
101
                .map(DataPayload::cast)
102
                .map(Err),
103
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, LO_LSTM)
15✔
104
                .unwrap()
105
                .map(DataPayload::cast)
106
                .map(Err),
107
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, TH_LSTM)
15✔
108
                .unwrap()
109
                .map(DataPayload::cast)
110
                .map(Err),
111
            ja: None,
15✔
UNCOV
112
        }
×
113
    }
15✔
114

115
    #[cfg(feature = "lstm")]
116
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
1✔
117
    where
118
        D: DataProvider<GraphemeClusterBreakDataV2Marker>
119
            + DataProvider<LstmForWordLineAutoV1Marker>
120
            + ?Sized,
121
    {
122
        Ok(Self {
1✔
123
            grapheme: provider.load(Default::default())?.payload,
1✔
124
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, MY_LSTM)?
1✔
125
                .map(DataPayload::cast)
126
                .map(Err),
127
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, KM_LSTM)?
1✔
128
                .map(DataPayload::cast)
129
                .map(Err),
130
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, LO_LSTM)?
1✔
131
                .map(DataPayload::cast)
132
                .map(Err),
133
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, TH_LSTM)?
1✔
134
                .map(DataPayload::cast)
135
                .map(Err),
136
            ja: None,
1✔
UNCOV
137
        })
×
138
    }
1✔
139

140
    #[cfg(feature = "compiled_data")]
141
    pub(crate) fn new_dict() -> Self {
5✔
142
        #[allow(clippy::unwrap_used)]
143
        // try_load is infallible if the provider only returns `MissingLocale`.
144
        Self {
5✔
145
            grapheme: DataPayload::from_static_ref(
5✔
146
                crate::provider::Baked::SINGLETON_GRAPHEME_CLUSTER_BREAK_DATA_V2_MARKER,
147
            ),
148
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
5✔
149
                &crate::provider::Baked,
150
                MY_DICT,
151
            )
152
            .unwrap()
153
            .map(DataPayload::cast)
154
            .map(Ok),
155
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
5✔
156
                &crate::provider::Baked,
157
                KM_DICT,
158
            )
159
            .unwrap()
160
            .map(DataPayload::cast)
161
            .map(Ok),
162
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
5✔
163
                &crate::provider::Baked,
164
                LO_DICT,
165
            )
166
            .unwrap()
167
            .map(DataPayload::cast)
168
            .map(Ok),
169
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
5✔
170
                &crate::provider::Baked,
171
                TH_DICT,
172
            )
173
            .unwrap()
174
            .map(DataPayload::cast)
175
            .map(Ok),
176
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, CJ_DICT)
5✔
177
                .unwrap()
178
                .map(DataPayload::cast),
UNCOV
179
        }
×
180
    }
5✔
181

182
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
2✔
183
    where
184
        D: DataProvider<GraphemeClusterBreakDataV2Marker>
185
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
186
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
187
            + ?Sized,
188
    {
189
        Ok(Self {
2✔
190
            grapheme: provider.load(Default::default())?.payload,
2✔
191
            my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, MY_DICT)?
2✔
192
                .map(DataPayload::cast)
193
                .map(Ok),
194
            km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, KM_DICT)?
2✔
195
                .map(DataPayload::cast)
196
                .map(Ok),
197
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, LO_DICT)?
2✔
198
                .map(DataPayload::cast)
199
                .map(Ok),
200
            th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, TH_DICT)?
2✔
201
                .map(DataPayload::cast)
202
                .map(Ok),
203
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, CJ_DICT)?
2✔
204
                .map(DataPayload::cast),
UNCOV
205
        })
×
206
    }
2✔
207

208
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
209
    #[cfg(feature = "compiled_data")]
210
    pub(crate) fn new_auto() -> Self {
14✔
211
        #[allow(clippy::unwrap_used)]
212
        // try_load is infallible if the provider only returns `MissingLocale`.
213
        Self {
10✔
214
            grapheme: DataPayload::from_static_ref(
18✔
215
                crate::provider::Baked::SINGLETON_GRAPHEME_CLUSTER_BREAK_DATA_V2_MARKER,
216
            ),
217
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, MY_LSTM)
14✔
218
                .unwrap()
219
                .map(DataPayload::cast)
220
                .map(Err),
221
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, KM_LSTM)
14✔
222
                .unwrap()
223
                .map(DataPayload::cast)
224
                .map(Err),
225
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, LO_LSTM)
14✔
226
                .unwrap()
227
                .map(DataPayload::cast)
228
                .map(Err),
229
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, TH_LSTM)
14✔
230
                .unwrap()
231
                .map(DataPayload::cast)
232
                .map(Err),
233
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(&crate::provider::Baked, CJ_DICT)
14✔
234
                .unwrap()
235
                .map(DataPayload::cast),
UNCOV
236
        }
×
237
    }
10✔
238

239
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
240
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
2✔
241
    where
242
        D: DataProvider<GraphemeClusterBreakDataV2Marker>
243
            + DataProvider<LstmForWordLineAutoV1Marker>
244
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
245
            + ?Sized,
246
    {
247
        Ok(Self {
2✔
248
            grapheme: provider.load(Default::default())?.payload,
2✔
249
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, MY_LSTM)?
2✔
250
                .map(DataPayload::cast)
251
                .map(Err),
252
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, KM_LSTM)?
2✔
253
                .map(DataPayload::cast)
254
                .map(Err),
255
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, LO_LSTM)?
2✔
256
                .map(DataPayload::cast)
257
                .map(Err),
258
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, TH_LSTM)?
2✔
259
                .map(DataPayload::cast)
260
                .map(Err),
261
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, CJ_DICT)?
2✔
262
                .map(DataPayload::cast),
UNCOV
263
        })
×
264
    }
2✔
265

266
    #[cfg(feature = "compiled_data")]
267
    pub(crate) fn new_southeast_asian() -> Self {
89✔
268
        #[allow(clippy::unwrap_used)]
269
        // try_load is infallible if the provider only returns `MissingLocale`.
270
        Self {
73✔
271
            grapheme: DataPayload::from_static_ref(
93✔
272
                crate::provider::Baked::SINGLETON_GRAPHEME_CLUSTER_BREAK_DATA_V2_MARKER,
273
            ),
274
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
89✔
275
                &crate::provider::Baked,
276
                MY_DICT,
277
            )
278
            .unwrap()
279
            .map(DataPayload::cast)
280
            .map(Ok),
281
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
87✔
282
                &crate::provider::Baked,
283
                KM_DICT,
284
            )
285
            .unwrap()
286
            .map(DataPayload::cast)
287
            .map(Ok),
288
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
81✔
289
                &crate::provider::Baked,
290
                LO_DICT,
291
            )
292
            .unwrap()
293
            .map(DataPayload::cast)
294
            .map(Ok),
295
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
77✔
296
                &crate::provider::Baked,
297
                TH_DICT,
298
            )
299
            .unwrap()
300
            .map(DataPayload::cast)
301
            .map(Ok),
302
            ja: None,
73✔
UNCOV
303
        }
×
304
    }
73✔
305

306
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
1✔
307
    where
308
        D: DataProvider<DictionaryForWordLineExtendedV1Marker>
309
            + DataProvider<GraphemeClusterBreakDataV2Marker>
310
            + ?Sized,
311
    {
312
        Ok(Self {
1✔
313
            grapheme: provider.load(Default::default())?.payload,
1✔
314
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, MY_DICT)?
1✔
315
                .map(DataPayload::cast)
316
                .map(Ok),
317
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, KM_DICT)?
1✔
318
                .map(DataPayload::cast)
319
                .map(Ok),
320
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, LO_DICT)?
1✔
321
                .map(DataPayload::cast)
322
                .map(Ok),
323
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, TH_DICT)?
1✔
324
                .map(DataPayload::cast)
325
                .map(Ok),
326
            ja: None,
1✔
UNCOV
327
        })
×
328
    }
1✔
329
}
330

331
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
546✔
332
    provider: &P,
333
    model: &'static DataMarkerAttributes,
334
) -> Result<Option<DataPayload<M>>, DataError> {
335
    provider
546✔
336
        .load(DataRequest {
546✔
337
            id: DataIdentifierBorrowed::for_marker_attributes(model),
546✔
338
            metadata: {
339
                let mut m = DataRequestMetadata::default();
546✔
340
                m.silent = true;
546✔
341
                m.attributes_prefix_match = true;
546✔
342
                m
546✔
343
            },
344
        })
345
        .allow_identifier_not_found()
346
        .map(|r| r.map(|r| r.payload))
1,102✔
347
}
546✔
348

349
/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
350
pub(crate) fn complex_language_segment_utf16(
36✔
351
    payloads: &ComplexPayloads,
352
    input: &[u16],
353
) -> Vec<usize> {
354
    let mut result = Vec::new();
36✔
355
    let mut offset = 0;
36✔
356
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
72✔
357
        match payloads.select(lang) {
36✔
358
            Some(Ok(dict)) => {
12✔
359
                result.extend(
12✔
360
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
12✔
361
                        .segment_utf16(slice)
362
                        .map(|n| offset + n),
41✔
363
                );
364
            }
365
            #[cfg(feature = "lstm")]
366
            Some(Err(lstm)) => {
24✔
367
                result.extend(
24✔
368
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
24✔
369
                        .segment_utf16(slice)
370
                        .map(|n| offset + n),
87✔
371
                );
372
            }
373
            #[cfg(not(feature = "lstm"))]
374
            Some(Err(_infallible)) => {} // should be refutable
375
            None => {
UNCOV
376
                result.push(offset + slice.len());
×
377
            }
378
        }
379
        offset += slice.len();
36✔
380
    }
381
    result
34✔
382
}
34✔
383

384
/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
385
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
42✔
386
    let mut result = Vec::new();
42✔
387
    let mut offset = 0;
42✔
388
    for (slice, lang) in LanguageIterator::new(input) {
84✔
389
        match payloads.select(lang) {
42✔
390
            Some(Ok(dict)) => {
18✔
391
                result.extend(
18✔
392
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
18✔
393
                        .segment_str(slice)
394
                        .map(|n| offset + n),
54✔
395
                );
396
            }
397
            #[cfg(feature = "lstm")]
398
            Some(Err(lstm)) => {
23✔
399
                result.extend(
23✔
400
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
23✔
401
                        .segment_str(slice)
402
                        .map(|n| offset + n),
84✔
403
                );
404
            }
405
            #[cfg(not(feature = "lstm"))]
406
            Some(Err(_infallible)) => {} // should be refutable
407
            None => {
408
                result.push(offset + slice.len());
1✔
409
            }
410
        }
411
        offset += slice.len();
42✔
412
    }
413
    result
42✔
414
}
42✔
415

416
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;

    /// The LSTM and dictionary payloads must both produce the same breaks for
    /// a repeated Thai phrase, in UTF-8 byte offsets and UTF-16 unit offsets.
    #[test]
    fn thai_word_break() {
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        const UTF8_BREAKS: [usize; 4] = [12, 21, 33, 42];
        const UTF16_BREAKS: [usize; 4] = [4, 7, 11, 14];

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        // LSTM first, then dictionary; each is checked on both encodings.
        for payloads in [ComplexPayloads::new_lstm(), ComplexPayloads::new_dict()] {
            assert_eq!(
                complex_language_segment_str(&payloads, TEST_STR),
                UTF8_BREAKS
            );
            assert_eq!(
                complex_language_segment_utf16(&payloads, &utf16),
                UTF16_BREAKS
            );
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc