• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 9014530096

08 May 2024 07:27PM UTC coverage: 76.402% (+0.2%) from 76.234%
9014530096

push

github

web-flow
Add missing std pointer-like impls for DataProvider, DynamicDataProvider (#4880)

0 of 3 new or added lines in 1 file covered. (0.0%)

3218 existing lines in 167 files now uncovered.

53328 of 69799 relevant lines covered (76.4%)

504343.42 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.59
/components/segmenter/src/complex/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
×
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use alloc::vec::Vec;
7
use icu_locid::{locale, Locale};
8
use icu_provider::prelude::*;
9

10
mod dictionary;
11
use dictionary::*;
12
mod language;
13
use language::*;
14
#[cfg(feature = "lstm")]
15
mod lstm;
16
#[cfg(feature = "lstm")]
17
use lstm::*;
18

19
/// Owned per-language model: `Ok` holds dictionary data, `Err` holds the alternative model.
/// Without the `lstm` feature the `Err` side is `Infallible`, so only the dictionary
/// variant can be constructed.
#[cfg(not(feature = "lstm"))]
20
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1Marker>, core::convert::Infallible>;
21
/// Borrowed counterpart of `DictOrLstm` (references on both sides of the `Result`).
#[cfg(not(feature = "lstm"))]
22
type DictOrLstmBorrowed<'a> =
23
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a core::convert::Infallible>;
24

25
/// Owned per-language model: `Ok` holds dictionary data, `Err` holds an
/// `LstmForWordLineAutoV1Marker` payload. `Result` is used here as a two-variant
/// container, not to signal failure.
#[cfg(feature = "lstm")]
26
type DictOrLstm =
27
    Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataPayload<LstmForWordLineAutoV1Marker>>;
28
/// Borrowed counterpart of `DictOrLstm` (references on both sides of the `Result`).
#[cfg(feature = "lstm")]
29
type DictOrLstmBorrowed<'a> = Result<
30
    &'a DataPayload<UCharDictionaryBreakDataV1Marker>,
31
    &'a DataPayload<LstmForWordLineAutoV1Marker>,
32
>;
33

34
/// Data payloads used to segment text in complex-script languages.
///
/// Each optional slot is `None` when no model was loaded for that language.
#[derive(Debug)]
×
35
pub(crate) struct ComplexPayloads {
36
    /// Grapheme cluster break data, passed to every dictionary/lstm segmenter.
    grapheme: DataPayload<GraphemeClusterBreakDataV1Marker>,
×
37
    /// Model selected for `Language::Burmese`.
    my: Option<DictOrLstm>,
×
38
    /// Model selected for `Language::Khmer`.
    km: Option<DictOrLstm>,
×
39
    /// Model selected for `Language::Lao`.
    lo: Option<DictOrLstm>,
×
UNCOV
40
    /// Model selected for `Language::Thai`.
    th: Option<DictOrLstm>,
×
UNCOV
41
    /// Dictionary selected for `Language::ChineseOrJapanese` (always dictionary data).
    ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>>,
×
42
}
43

44
impl ComplexPayloads {
45
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
78✔
46
        const ERR: DataError = DataError::custom("No segmentation model for language");
47
        match language {
78✔
48
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
5✔
49
                ERR.with_display_context("my");
×
UNCOV
50
                None
×
51
            }),
×
52
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
6✔
53
                ERR.with_display_context("km");
×
UNCOV
54
                None
×
55
            }),
×
56
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
6✔
57
                ERR.with_display_context("lo");
×
UNCOV
58
                None
×
59
            }),
×
60
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
49✔
61
                ERR.with_display_context("th");
×
UNCOV
62
                None
×
UNCOV
63
            }),
×
64
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
13✔
65
                ERR.with_display_context("ja");
1✔
66
                None
1✔
67
            }),
1✔
UNCOV
68
            Language::Unknown => None,
×
69
        }
70
    }
78✔
71

72
    #[cfg(feature = "lstm")]
73
    #[cfg(feature = "compiled_data")]
74
    pub(crate) fn new_lstm() -> Self {
15✔
75
        #[allow(clippy::unwrap_used)]
76
        // try_load is infallible if the provider only returns `MissingLocale`.
77
        Self {
15✔
78
            grapheme: DataPayload::from_static_ref(
15✔
79
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
80
            ),
81
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
15✔
82
                .unwrap()
83
                .map(DataPayload::cast)
84
                .map(Err),
85
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
15✔
86
                .unwrap()
87
                .map(DataPayload::cast)
88
                .map(Err),
89
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
15✔
90
                .unwrap()
91
                .map(DataPayload::cast)
92
                .map(Err),
93
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
15✔
94
                .unwrap()
95
                .map(DataPayload::cast)
96
                .map(Err),
97
            ja: None,
15✔
UNCOV
98
        }
×
99
    }
15✔
100

101
    #[cfg(feature = "lstm")]
102
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
1✔
103
    where
104
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
105
            + DataProvider<LstmForWordLineAutoV1Marker>
106
            + ?Sized,
107
    {
108
        Ok(Self {
1✔
109
            grapheme: provider.load(Default::default())?.take_payload()?,
1✔
110
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
1✔
111
                .map(DataPayload::cast)
112
                .map(Err),
113
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
1✔
114
                .map(DataPayload::cast)
115
                .map(Err),
116
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
1✔
117
                .map(DataPayload::cast)
118
                .map(Err),
119
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
1✔
120
                .map(DataPayload::cast)
121
                .map(Err),
122
            ja: None,
1✔
UNCOV
123
        })
×
124
    }
1✔
125

126
    #[cfg(feature = "compiled_data")]
127
    pub(crate) fn new_dict() -> Self {
7✔
128
        #[allow(clippy::unwrap_used)]
129
        // try_load is infallible if the provider only returns `MissingLocale`.
130
        Self {
7✔
131
            grapheme: DataPayload::from_static_ref(
7✔
132
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
133
            ),
134
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
7✔
135
                &crate::provider::Baked,
136
                locale!("my"),
7✔
137
            )
138
            .unwrap()
139
            .map(DataPayload::cast)
140
            .map(Ok),
141
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
7✔
142
                &crate::provider::Baked,
143
                locale!("km"),
7✔
144
            )
145
            .unwrap()
146
            .map(DataPayload::cast)
147
            .map(Ok),
148
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
7✔
149
                &crate::provider::Baked,
150
                locale!("lo"),
7✔
151
            )
152
            .unwrap()
153
            .map(DataPayload::cast)
154
            .map(Ok),
155
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
7✔
156
                &crate::provider::Baked,
157
                locale!("th"),
7✔
158
            )
159
            .unwrap()
160
            .map(DataPayload::cast)
161
            .map(Ok),
162
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
7✔
163
                &crate::provider::Baked,
164
                locale!("ja"),
7✔
165
            )
166
            .unwrap()
167
            .map(DataPayload::cast),
UNCOV
168
        }
×
169
    }
7✔
170

UNCOV
171
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
×
172
    where
173
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
174
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
175
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
176
            + ?Sized,
177
    {
178
        Ok(Self {
×
UNCOV
179
            grapheme: provider.load(Default::default())?.take_payload()?,
×
UNCOV
180
            my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("my"))?
×
181
                .map(DataPayload::cast)
182
                .map(Ok),
UNCOV
183
            km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("km"))?
×
184
                .map(DataPayload::cast)
185
                .map(Ok),
UNCOV
186
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("lo"))?
×
187
                .map(DataPayload::cast)
188
                .map(Ok),
UNCOV
189
            th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("th"))?
×
190
                .map(DataPayload::cast)
191
                .map(Ok),
192
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
×
193
                .map(DataPayload::cast),
UNCOV
194
        })
×
UNCOV
195
    }
×
196

197
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
198
    #[cfg(feature = "compiled_data")]
199
    pub(crate) fn new_auto() -> Self {
15✔
200
        #[allow(clippy::unwrap_used)]
201
        // try_load is infallible if the provider only returns `MissingLocale`.
202
        Self {
11✔
203
            grapheme: DataPayload::from_static_ref(
15✔
204
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
205
            ),
206
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
15✔
207
                .unwrap()
208
                .map(DataPayload::cast)
209
                .map(Err),
210
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
13✔
211
                .unwrap()
212
                .map(DataPayload::cast)
213
                .map(Err),
214
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
13✔
215
                .unwrap()
216
                .map(DataPayload::cast)
217
                .map(Err),
218
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
13✔
219
                .unwrap()
220
                .map(DataPayload::cast)
221
                .map(Err),
222
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
13✔
223
                &crate::provider::Baked,
224
                locale!("ja"),
19✔
225
            )
226
            .unwrap()
227
            .map(DataPayload::cast),
UNCOV
228
        }
×
229
    }
11✔
230

231
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
UNCOV
232
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
×
233
    where
234
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
235
            + DataProvider<LstmForWordLineAutoV1Marker>
236
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
237
            + ?Sized,
238
    {
239
        Ok(Self {
×
UNCOV
240
            grapheme: provider.load(Default::default())?.take_payload()?,
×
UNCOV
241
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
×
242
                .map(DataPayload::cast)
243
                .map(Err),
UNCOV
244
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
×
245
                .map(DataPayload::cast)
246
                .map(Err),
UNCOV
247
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
×
248
                .map(DataPayload::cast)
249
                .map(Err),
UNCOV
250
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
×
251
                .map(DataPayload::cast)
252
                .map(Err),
253
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
×
254
                .map(DataPayload::cast),
UNCOV
255
        })
×
UNCOV
256
    }
×
257

258
    #[cfg(feature = "compiled_data")]
259
    pub(crate) fn new_southeast_asian() -> Self {
89✔
260
        #[allow(clippy::unwrap_used)]
261
        // try_load is infallible if the provider only returns `MissingLocale`.
262
        Self {
81✔
263
            grapheme: DataPayload::from_static_ref(
91✔
264
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
265
            ),
266
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
89✔
267
                &crate::provider::Baked,
268
                locale!("my"),
91✔
269
            )
270
            .unwrap()
271
            .map(DataPayload::cast)
272
            .map(Ok),
273
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
85✔
274
                &crate::provider::Baked,
275
                locale!("km"),
89✔
276
            )
277
            .unwrap()
278
            .map(DataPayload::cast)
279
            .map(Ok),
280
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
85✔
281
                &crate::provider::Baked,
282
                locale!("lo"),
89✔
283
            )
284
            .unwrap()
285
            .map(DataPayload::cast)
286
            .map(Ok),
287
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
83✔
288
                &crate::provider::Baked,
289
                locale!("th"),
93✔
290
            )
291
            .unwrap()
292
            .map(DataPayload::cast)
293
            .map(Ok),
294
            ja: None,
81✔
UNCOV
295
        }
×
296
    }
81✔
297

298
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
1✔
299
    where
300
        D: DataProvider<DictionaryForWordLineExtendedV1Marker>
301
            + DataProvider<GraphemeClusterBreakDataV1Marker>
302
            + ?Sized,
303
    {
304
        Ok(Self {
1✔
305
            grapheme: provider.load(Default::default())?.take_payload()?,
1✔
306
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("my"))?
1✔
307
                .map(DataPayload::cast)
308
                .map(Ok),
309
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("km"))?
1✔
310
                .map(DataPayload::cast)
311
                .map(Ok),
312
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("lo"))?
1✔
313
                .map(DataPayload::cast)
314
                .map(Ok),
315
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("th"))?
1✔
316
                .map(DataPayload::cast)
317
                .map(Ok),
318
            ja: None,
1✔
UNCOV
319
        })
×
320
    }
1✔
321
}
322

323
fn try_load<M: KeyedDataMarker, P: DataProvider<M> + ?Sized>(
538✔
324
    provider: &P,
325
    locale: Locale,
326
) -> Result<Option<DataPayload<M>>, DataError> {
327
    match provider.load(DataRequest {
534✔
328
        locale: &DataLocale::from(locale),
538✔
329
        metadata: {
330
            let mut m = DataRequestMetadata::default();
538✔
331
            m.silent = true;
542✔
332
            m
333
        },
334
    }) {
335
        Ok(response) => Ok(Some(response.take_payload()?)),
538✔
336
        Err(DataError {
337
            kind: DataErrorKind::MissingLocale,
338
            ..
UNCOV
339
        }) => Ok(None),
×
UNCOV
340
        Err(e) => Err(e),
×
341
    }
342
}
538✔
343

344
/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
345
pub(crate) fn complex_language_segment_utf16(
36✔
346
    payloads: &ComplexPayloads,
347
    input: &[u16],
348
) -> Vec<usize> {
349
    let mut result = Vec::new();
36✔
350
    let mut offset = 0;
36✔
351
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
72✔
352
        match payloads.select(lang) {
36✔
353
            Some(Ok(dict)) => {
12✔
354
                result.extend(
12✔
355
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
12✔
356
                        .segment_utf16(slice)
357
                        .map(|n| offset + n),
53✔
358
                );
359
            }
360
            #[cfg(feature = "lstm")]
361
            Some(Err(lstm)) => {
24✔
362
                result.extend(
24✔
363
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
24✔
364
                        .segment_utf16(slice)
365
                        .map(|n| offset + n),
111✔
366
                );
367
            }
368
            #[cfg(not(feature = "lstm"))]
369
            Some(Err(_infallible)) => {} // should be refutable
370
            None => {
UNCOV
371
                result.push(offset + slice.len());
×
372
            }
373
        }
374
        offset += slice.len();
36✔
375
    }
376
    result
377
}
36✔
378

379
/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
380
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
42✔
381
    let mut result = Vec::new();
42✔
382
    let mut offset = 0;
42✔
383
    for (slice, lang) in LanguageIterator::new(input) {
84✔
384
        match payloads.select(lang) {
42✔
385
            Some(Ok(dict)) => {
18✔
386
                result.extend(
18✔
387
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
18✔
388
                        .segment_str(slice)
389
                        .map(|n| offset + n),
72✔
390
                );
391
            }
392
            #[cfg(feature = "lstm")]
393
            Some(Err(lstm)) => {
23✔
394
                result.extend(
23✔
395
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
23✔
396
                        .segment_str(slice)
397
                        .map(|n| offset + n),
107✔
398
                );
399
            }
400
            #[cfg(not(feature = "lstm"))]
401
            Some(Err(_infallible)) => {} // should be refutable
402
            None => {
403
                result.push(offset + slice.len());
1✔
404
            }
405
        }
406
        offset += slice.len();
42✔
407
    }
408
    result
409
}
42✔
410

411
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;

    /// The lstm and dictionary payload sets must produce the same Thai break offsets,
    /// for both the UTF-8 and the UTF-16 entry points.
    #[test]
    fn thai_word_break() {
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        let lstm = ComplexPayloads::new_lstm();
        let dict = ComplexPayloads::new_dict();

        for payloads in [&lstm, &dict] {
            assert_eq!(
                complex_language_segment_str(payloads, TEST_STR),
                [12, 21, 33, 42]
            );
            assert_eq!(
                complex_language_segment_utf16(payloads, &utf16),
                [4, 7, 11, 14]
            );
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc