• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 6815798908

09 Nov 2023 05:17PM CUT coverage: 72.607% (-2.4%) from 75.01%
6815798908

push

github

web-flow
Implement `Any/BufferProvider` for some smart pointers (#4255)

Allows storing them as a `Box<dyn Any/BufferProvider>` without using a
wrapper type that implements the trait.

44281 of 60987 relevant lines covered (72.61%)

201375.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.59
/components/segmenter/src/complex/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
×
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use alloc::vec::Vec;
7
use icu_locid::{locale, Locale};
8
use icu_provider::prelude::*;
9

10
mod dictionary;
11
use dictionary::*;
12
mod language;
13
use language::*;
14
#[cfg(feature = "lstm")]
15
mod lstm;
16
#[cfg(feature = "lstm")]
17
use lstm::*;
18

19
#[cfg(not(feature = "lstm"))]
20
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1Marker>, core::convert::Infallible>;
21
#[cfg(not(feature = "lstm"))]
22
type DictOrLstmBorrowed<'a> =
23
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a core::convert::Infallible>;
24

25
#[cfg(feature = "lstm")]
26
type DictOrLstm =
27
    Result<DataPayload<UCharDictionaryBreakDataV1Marker>, DataPayload<LstmDataV1Marker>>;
28
#[cfg(feature = "lstm")]
29
type DictOrLstmBorrowed<'a> =
30
    Result<&'a DataPayload<UCharDictionaryBreakDataV1Marker>, &'a DataPayload<LstmDataV1Marker>>;
31

32
#[derive(Debug)]
×
33
pub(crate) struct ComplexPayloads {
34
    grapheme: DataPayload<GraphemeClusterBreakDataV1Marker>,
×
35
    my: Option<DictOrLstm>,
×
36
    km: Option<DictOrLstm>,
×
37
    lo: Option<DictOrLstm>,
×
38
    th: Option<DictOrLstm>,
×
39
    ja: Option<DataPayload<UCharDictionaryBreakDataV1Marker>>,
×
40
}
41

42
impl ComplexPayloads {
43
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
78✔
44
        const ERR: DataError = DataError::custom("No segmentation model for language");
45
        match language {
78✔
46
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
5✔
47
                ERR.with_display_context("my");
×
48
                None
×
49
            }),
×
50
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
6✔
51
                ERR.with_display_context("km");
×
52
                None
×
53
            }),
×
54
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
6✔
55
                ERR.with_display_context("lo");
×
56
                None
×
57
            }),
×
58
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
49✔
59
                ERR.with_display_context("th");
×
60
                None
×
61
            }),
×
62
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
13✔
63
                ERR.with_display_context("ja");
1✔
64
                None
1✔
65
            }),
1✔
66
            Language::Unknown => None,
×
67
        }
68
    }
78✔
69

70
    #[cfg(feature = "lstm")]
71
    #[cfg(feature = "compiled_data")]
72
    pub(crate) fn new_lstm() -> Self {
15✔
73
        #[allow(clippy::unwrap_used)]
74
        // try_load is infallible if the provider only returns `MissingLocale`.
75
        Self {
15✔
76
            grapheme: DataPayload::from_static_ref(
15✔
77
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
78
            ),
79
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
15✔
80
                .unwrap()
81
                .map(DataPayload::cast)
82
                .map(Err),
83
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
15✔
84
                .unwrap()
85
                .map(DataPayload::cast)
86
                .map(Err),
87
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
15✔
88
                .unwrap()
89
                .map(DataPayload::cast)
90
                .map(Err),
91
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
15✔
92
                .unwrap()
93
                .map(DataPayload::cast)
94
                .map(Err),
95
            ja: None,
15✔
96
        }
×
97
    }
15✔
98

99
    #[cfg(feature = "lstm")]
100
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
1✔
101
    where
102
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
103
            + DataProvider<LstmForWordLineAutoV1Marker>
104
            + ?Sized,
105
    {
106
        Ok(Self {
1✔
107
            grapheme: provider.load(Default::default())?.take_payload()?,
1✔
108
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
1✔
109
                .map(DataPayload::cast)
110
                .map(Err),
111
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
1✔
112
                .map(DataPayload::cast)
113
                .map(Err),
114
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
1✔
115
                .map(DataPayload::cast)
116
                .map(Err),
117
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
1✔
118
                .map(DataPayload::cast)
119
                .map(Err),
120
            ja: None,
1✔
121
        })
×
122
    }
1✔
123

124
    #[cfg(feature = "compiled_data")]
125
    pub(crate) fn new_dict() -> Self {
6✔
126
        #[allow(clippy::unwrap_used)]
127
        // try_load is infallible if the provider only returns `MissingLocale`.
128
        Self {
6✔
129
            grapheme: DataPayload::from_static_ref(
6✔
130
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
131
            ),
132
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
6✔
133
                &crate::provider::Baked,
134
                locale!("my"),
6✔
135
            )
136
            .unwrap()
137
            .map(DataPayload::cast)
138
            .map(Ok),
139
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
6✔
140
                &crate::provider::Baked,
141
                locale!("km"),
6✔
142
            )
143
            .unwrap()
144
            .map(DataPayload::cast)
145
            .map(Ok),
146
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
6✔
147
                &crate::provider::Baked,
148
                locale!("lo"),
6✔
149
            )
150
            .unwrap()
151
            .map(DataPayload::cast)
152
            .map(Ok),
153
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
6✔
154
                &crate::provider::Baked,
155
                locale!("th"),
6✔
156
            )
157
            .unwrap()
158
            .map(DataPayload::cast)
159
            .map(Ok),
160
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
6✔
161
                &crate::provider::Baked,
162
                locale!("ja"),
6✔
163
            )
164
            .unwrap()
165
            .map(DataPayload::cast),
166
        }
×
167
    }
6✔
168

169
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
×
170
    where
171
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
172
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
173
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
174
            + ?Sized,
175
    {
176
        Ok(Self {
×
177
            grapheme: provider.load(Default::default())?.take_payload()?,
×
178
            my: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("my"))?
×
179
                .map(DataPayload::cast)
180
                .map(Ok),
181
            km: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("km"))?
×
182
                .map(DataPayload::cast)
183
                .map(Ok),
184
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("lo"))?
×
185
                .map(DataPayload::cast)
186
                .map(Ok),
187
            th: try_load::<DictionaryForWordLineExtendedV1Marker, D>(provider, locale!("th"))?
×
188
                .map(DataPayload::cast)
189
                .map(Ok),
190
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
×
191
                .map(DataPayload::cast),
192
        })
×
193
    }
×
194

195
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
196
    #[cfg(feature = "compiled_data")]
197
    pub(crate) fn new_auto() -> Self {
15✔
198
        #[allow(clippy::unwrap_used)]
199
        // try_load is infallible if the provider only returns `MissingLocale`.
200
        Self {
5✔
201
            grapheme: DataPayload::from_static_ref(
17✔
202
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
203
            ),
204
            my: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("my"))
15✔
205
                .unwrap()
206
                .map(DataPayload::cast)
207
                .map(Err),
208
            km: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("km"))
15✔
209
                .unwrap()
210
                .map(DataPayload::cast)
211
                .map(Err),
212
            lo: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("lo"))
11✔
213
                .unwrap()
214
                .map(DataPayload::cast)
215
                .map(Err),
216
            th: try_load::<LstmForWordLineAutoV1Marker, _>(&crate::provider::Baked, locale!("th"))
7✔
217
                .unwrap()
218
                .map(DataPayload::cast)
219
                .map(Err),
220
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, _>(
5✔
221
                &crate::provider::Baked,
222
                locale!("ja"),
21✔
223
            )
224
            .unwrap()
225
            .map(DataPayload::cast),
226
        }
×
227
    }
5✔
228

229
    #[cfg(feature = "auto")] // Use by WordSegmenter with "auto" enabled.
230
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
×
231
    where
232
        D: DataProvider<GraphemeClusterBreakDataV1Marker>
233
            + DataProvider<LstmForWordLineAutoV1Marker>
234
            + DataProvider<DictionaryForWordOnlyAutoV1Marker>
235
            + ?Sized,
236
    {
237
        Ok(Self {
×
238
            grapheme: provider.load(Default::default())?.take_payload()?,
×
239
            my: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("my"))?
×
240
                .map(DataPayload::cast)
241
                .map(Err),
242
            km: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("km"))?
×
243
                .map(DataPayload::cast)
244
                .map(Err),
245
            lo: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("lo"))?
×
246
                .map(DataPayload::cast)
247
                .map(Err),
248
            th: try_load::<LstmForWordLineAutoV1Marker, D>(provider, locale!("th"))?
×
249
                .map(DataPayload::cast)
250
                .map(Err),
251
            ja: try_load::<DictionaryForWordOnlyAutoV1Marker, D>(provider, locale!("ja"))?
×
252
                .map(DataPayload::cast),
253
        })
×
254
    }
×
255

256
    #[cfg(feature = "compiled_data")]
257
    pub(crate) fn new_southeast_asian() -> Self {
90✔
258
        #[allow(clippy::unwrap_used)]
259
        // try_load is infallible if the provider only returns `MissingLocale`.
260
        Self {
64✔
261
            grapheme: DataPayload::from_static_ref(
90✔
262
                crate::provider::Baked::SINGLETON_SEGMENTER_GRAPHEME_V1,
263
            ),
264
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
90✔
265
                &crate::provider::Baked,
266
                locale!("my"),
90✔
267
            )
268
            .unwrap()
269
            .map(DataPayload::cast)
270
            .map(Ok),
271
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
78✔
272
                &crate::provider::Baked,
273
                locale!("km"),
94✔
274
            )
275
            .unwrap()
276
            .map(DataPayload::cast)
277
            .map(Ok),
278
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
72✔
279
                &crate::provider::Baked,
280
                locale!("lo"),
102✔
281
            )
282
            .unwrap()
283
            .map(DataPayload::cast)
284
            .map(Ok),
285
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(
64✔
286
                &crate::provider::Baked,
287
                locale!("th"),
102✔
288
            )
289
            .unwrap()
290
            .map(DataPayload::cast)
291
            .map(Ok),
292
            ja: None,
64✔
293
        }
×
294
    }
64✔
295

296
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
1✔
297
    where
298
        D: DataProvider<DictionaryForWordLineExtendedV1Marker>
299
            + DataProvider<GraphemeClusterBreakDataV1Marker>
300
            + ?Sized,
301
    {
302
        Ok(Self {
1✔
303
            grapheme: provider.load(Default::default())?.take_payload()?,
1✔
304
            my: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("my"))?
1✔
305
                .map(DataPayload::cast)
306
                .map(Ok),
307
            km: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("km"))?
1✔
308
                .map(DataPayload::cast)
309
                .map(Ok),
310
            lo: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("lo"))?
1✔
311
                .map(DataPayload::cast)
312
                .map(Ok),
313
            th: try_load::<DictionaryForWordLineExtendedV1Marker, _>(provider, locale!("th"))?
1✔
314
                .map(DataPayload::cast)
315
                .map(Ok),
316
            ja: None,
1✔
317
        })
×
318
    }
1✔
319
}
320

321
fn try_load<M: KeyedDataMarker, P: DataProvider<M> + ?Sized>(
523✔
322
    provider: &P,
323
    locale: Locale,
324
) -> Result<Option<DataPayload<M>>, DataError> {
325
    match provider.load(DataRequest {
517✔
326
        locale: &DataLocale::from(locale),
523✔
327
        metadata: {
328
            let mut m = DataRequestMetadata::default();
523✔
329
            m.silent = true;
529✔
330
            m
331
        },
332
    }) {
333
        Ok(response) => Ok(Some(response.take_payload()?)),
523✔
334
        Err(DataError {
335
            kind: DataErrorKind::MissingLocale,
336
            ..
337
        }) => Ok(None),
×
338
        Err(e) => Err(e),
×
339
    }
340
}
517✔
341

342
/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
343
pub(crate) fn complex_language_segment_utf16(
36✔
344
    payloads: &ComplexPayloads,
345
    input: &[u16],
346
) -> Vec<usize> {
347
    let mut result = Vec::new();
36✔
348
    let mut offset = 0;
36✔
349
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
72✔
350
        match payloads.select(lang) {
36✔
351
            Some(Ok(dict)) => {
12✔
352
                result.extend(
12✔
353
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
12✔
354
                        .segment_utf16(slice)
355
                        .map(|n| offset + n),
53✔
356
                );
357
            }
358
            #[cfg(feature = "lstm")]
359
            Some(Err(lstm)) => {
24✔
360
                result.extend(
24✔
361
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
24✔
362
                        .segment_utf16(slice)
363
                        .map(|n| offset + n),
111✔
364
                );
365
            }
366
            #[cfg(not(feature = "lstm"))]
367
            Some(Err(_infallible)) => {} // should be refutable
368
            None => {
369
                result.push(offset + slice.len());
×
370
            }
371
        }
372
        offset += slice.len();
36✔
373
    }
374
    result
375
}
36✔
376

377
/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
378
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
42✔
379
    let mut result = Vec::new();
42✔
380
    let mut offset = 0;
42✔
381
    for (slice, lang) in LanguageIterator::new(input) {
84✔
382
        match payloads.select(lang) {
42✔
383
            Some(Ok(dict)) => {
18✔
384
                result.extend(
18✔
385
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
18✔
386
                        .segment_str(slice)
387
                        .map(|n| offset + n),
72✔
388
                );
389
            }
390
            #[cfg(feature = "lstm")]
391
            Some(Err(lstm)) => {
23✔
392
                result.extend(
23✔
393
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
23✔
394
                        .segment_str(slice)
395
                        .map(|n| offset + n),
107✔
396
                );
397
            }
398
            #[cfg(not(feature = "lstm"))]
399
            Some(Err(_infallible)) => {} // should be refutable
400
            None => {
401
                result.push(offset + slice.len());
1✔
402
            }
403
        }
404
        offset += slice.len();
42✔
405
    }
406
    result
407
}
40✔
408

409
// These tests call `ComplexPayloads::new_lstm` (needs `lstm` + `compiled_data`) and
// `ComplexPayloads::new_dict` (needs `compiled_data`), so the module is gated on exactly
// those features. Nothing here depends on `serde`.
#[cfg(all(test, feature = "lstm", feature = "compiled_data"))]
mod tests {
    use super::*;

    /// The LSTM and the dictionary models must agree on the word breaks of a Thai
    /// sample, for both the UTF-8 and the UTF-16 entry points.
    #[test]
    fn thai_word_break() {
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        let lstm = ComplexPayloads::new_lstm();
        let dict = ComplexPayloads::new_dict();

        for (model, payloads) in [("lstm", &lstm), ("dict", &dict)] {
            // Byte offsets into the UTF-8 text.
            assert_eq!(
                complex_language_segment_str(payloads, TEST_STR),
                [12, 21, 33, 42],
                "UTF-8 breaks, model: {}",
                model
            );
            // Code-unit offsets into the UTF-16 text.
            assert_eq!(
                complex_language_segment_utf16(payloads, &utf16),
                [4, 7, 11, 14],
                "UTF-16 breaks, model: {}",
                model
            );
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc