• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 13958601093

19 Mar 2025 04:17PM UTC coverage: 74.164% (-1.5%) from 75.71%
13958601093

push

github

web-flow
Clean up properties docs (#6315)

58056 of 78281 relevant lines covered (74.16%)

819371.32 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.74
/components/segmenter/src/complex/mod.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use crate::provider::*;
6
use alloc::vec::Vec;
7
use icu_provider::prelude::*;
8

9
mod dictionary;
10
use dictionary::*;
11
mod language;
12
use language::*;
13
#[cfg(feature = "lstm")]
14
mod lstm;
15
#[cfg(feature = "lstm")]
16
use lstm::*;
17

18
#[cfg(not(feature = "lstm"))]
19
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1>, core::convert::Infallible>;
20
#[cfg(not(feature = "lstm"))]
21
type DictOrLstmBorrowed<'a> =
22
    Result<&'a DataPayload<UCharDictionaryBreakDataV1>, &'a core::convert::Infallible>;
23

24
#[cfg(feature = "lstm")]
25
type DictOrLstm = Result<DataPayload<UCharDictionaryBreakDataV1>, DataPayload<SegmenterLstmAutoV1>>;
26
#[cfg(feature = "lstm")]
27
type DictOrLstmBorrowed<'a> =
28
    Result<&'a DataPayload<UCharDictionaryBreakDataV1>, &'a DataPayload<SegmenterLstmAutoV1>>;
29

30
#[derive(Debug)]
×
31
pub(crate) struct ComplexPayloads {
32
    grapheme: DataPayload<SegmenterBreakGraphemeClusterV1>,
×
33
    my: Option<DictOrLstm>,
×
34
    km: Option<DictOrLstm>,
×
35
    lo: Option<DictOrLstm>,
×
36
    th: Option<DictOrLstm>,
×
37
    ja: Option<DataPayload<UCharDictionaryBreakDataV1>>,
×
38
}
39

40
#[cfg(feature = "lstm")]
41
const MY_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Burmese_");
42
#[cfg(feature = "lstm")]
43
const KM_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Khmer_");
44
#[cfg(feature = "lstm")]
45
const LO_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Lao_");
46
#[cfg(feature = "lstm")]
47
const TH_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Thai_");
48

49
const MY_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("burmesedict");
50
const KM_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("khmerdict");
51
const LO_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("laodict");
52
const TH_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("thaidict");
53
const CJ_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("cjdict");
54

55
impl ComplexPayloads {
56
    fn select(&self, language: Language) -> Option<DictOrLstmBorrowed> {
78✔
57
        const ERR: DataError = DataError::custom("No segmentation model for language");
58
        match language {
78✔
59
            Language::Burmese => self.my.as_ref().map(Result::as_ref).or_else(|| {
5✔
60
                ERR.with_display_context("my");
×
61
                None
×
62
            }),
×
63
            Language::Khmer => self.km.as_ref().map(Result::as_ref).or_else(|| {
6✔
64
                ERR.with_display_context("km");
×
65
                None
×
66
            }),
×
67
            Language::Lao => self.lo.as_ref().map(Result::as_ref).or_else(|| {
6✔
68
                ERR.with_display_context("lo");
×
69
                None
×
70
            }),
×
71
            Language::Thai => self.th.as_ref().map(Result::as_ref).or_else(|| {
49✔
72
                ERR.with_display_context("th");
×
73
                None
×
74
            }),
×
75
            Language::ChineseOrJapanese => self.ja.as_ref().map(Ok).or_else(|| {
13✔
76
                ERR.with_display_context("ja");
1✔
77
                None
1✔
78
            }),
1✔
79
            Language::Unknown => None,
×
80
        }
81
    }
78✔
82

83
    /// Builds payloads from compiled data, using LSTM models for Burmese,
    /// Khmer, Lao and Thai and no model for Chinese/Japanese.
    #[cfg(feature = "lstm")]
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_lstm() -> Self {
        // `try_load` turns a missing model into `None`; it is treated as
        // infallible here because the baked provider is expected to report
        // nothing but a missing identifier.
        #[allow(clippy::unwrap_used)]
        let lstm = |model: &'static DataMarkerAttributes| -> Option<DictOrLstm> {
            try_load::<SegmenterLstmAutoV1, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Err)
        };

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1,
            ),
            my: lstm(MY_LSTM),
            km: lstm(KM_LSTM),
            lo: lstm(LO_LSTM),
            th: lstm(TH_LSTM),
            ja: None,
        }
    }
15✔
111

112
    /// Builds payloads from `provider`, using LSTM models for Burmese, Khmer,
    /// Lao and Thai and no model for Chinese/Japanese.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the grapheme data cannot be loaded or
    /// if `try_load` reports an error for one of the models. A model that is
    /// simply absent is not an error; its slot is left as `None`.
    #[cfg(feature = "lstm")]
    pub(crate) fn try_new_lstm<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<SegmenterBreakGraphemeClusterV1>
            + DataProvider<SegmenterLstmAutoV1>
            + ?Sized,
    {
        let lstm =
            |model: &'static DataMarkerAttributes| -> Result<Option<DictOrLstm>, DataError> {
                let loaded = try_load::<SegmenterLstmAutoV1, D>(provider, model)?;
                Ok(loaded.map(DataPayload::cast).map(Err))
            };

        Ok(Self {
            grapheme: provider.load(Default::default())?.payload,
            my: lstm(MY_LSTM)?,
            km: lstm(KM_LSTM)?,
            lo: lstm(LO_LSTM)?,
            th: lstm(TH_LSTM)?,
            ja: None,
        })
    }
1✔
136

137
    /// Builds payloads from compiled data, using dictionaries for every
    /// supported language, including Chinese/Japanese.
    #[cfg(feature = "compiled_data")]
    // `try_load` turns a missing model into `None`; it is treated as
    // infallible here because the baked provider is expected to report
    // nothing but a missing identifier.
    #[allow(clippy::unwrap_used)]
    pub(crate) fn new_dict() -> Self {
        let dict = |model: &'static DataMarkerAttributes| -> Option<DictOrLstm> {
            try_load::<SegmenterDictionaryExtendedV1, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Ok)
        };

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1,
            ),
            my: dict(MY_DICT),
            km: dict(KM_DICT),
            lo: dict(LO_DICT),
            th: dict(TH_DICT),
            // The CJ dictionary is requested under a different marker than
            // the other four.
            ja: try_load::<SegmenterDictionaryAutoV1, _>(&crate::provider::Baked, CJ_DICT)
                .unwrap()
                .map(DataPayload::cast),
        }
    }
5✔
166

167
    pub(crate) fn try_new_dict<D>(provider: &D) -> Result<Self, DataError>
2✔
168
    where
169
        D: DataProvider<SegmenterBreakGraphemeClusterV1>
170
            + DataProvider<SegmenterDictionaryExtendedV1>
171
            + DataProvider<SegmenterDictionaryAutoV1>
172
            + ?Sized,
173
    {
174
        Ok(Self {
2✔
175
            grapheme: provider.load(Default::default())?.payload,
2✔
176
            my: try_load::<SegmenterDictionaryExtendedV1, D>(provider, MY_DICT)?
2✔
177
                .map(DataPayload::cast)
178
                .map(Ok),
179
            km: try_load::<SegmenterDictionaryExtendedV1, D>(provider, KM_DICT)?
2✔
180
                .map(DataPayload::cast)
181
                .map(Ok),
182
            lo: try_load::<SegmenterDictionaryExtendedV1, D>(provider, LO_DICT)?
2✔
183
                .map(DataPayload::cast)
184
                .map(Ok),
185
            th: try_load::<SegmenterDictionaryExtendedV1, D>(provider, TH_DICT)?
2✔
186
                .map(DataPayload::cast)
187
                .map(Ok),
188
            ja: try_load::<SegmenterDictionaryAutoV1, D>(provider, CJ_DICT)?.map(DataPayload::cast),
2✔
189
        })
×
190
    }
2✔
191

192
    /// Builds payloads from compiled data, using LSTM models for Burmese,
    /// Khmer, Lao and Thai plus the dictionary for Chinese/Japanese.
    #[cfg(feature = "auto")] // Used by WordSegmenter when "auto" is enabled.
    #[cfg(feature = "compiled_data")]
    // `try_load` turns a missing model into `None`; it is treated as
    // infallible here because the baked provider is expected to report
    // nothing but a missing identifier.
    #[allow(clippy::unwrap_used)]
    pub(crate) fn new_auto() -> Self {
        let lstm = |model: &'static DataMarkerAttributes| -> Option<DictOrLstm> {
            try_load::<SegmenterLstmAutoV1, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Err)
        };

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1,
            ),
            my: lstm(MY_LSTM),
            km: lstm(KM_LSTM),
            lo: lstm(LO_LSTM),
            th: lstm(TH_LSTM),
            ja: try_load::<SegmenterDictionaryAutoV1, _>(&crate::provider::Baked, CJ_DICT)
                .unwrap()
                .map(DataPayload::cast),
        }
    }
6✔
222

223
    /// Builds payloads from `provider`, using LSTM models for Burmese, Khmer,
    /// Lao and Thai plus the dictionary for Chinese/Japanese.
    ///
    /// # Errors
    ///
    /// Returns the provider's error if the grapheme data cannot be loaded or
    /// if `try_load` reports an error for one of the models. A model that is
    /// simply absent leaves its slot as `None`.
    #[cfg(feature = "auto")] // Used by WordSegmenter when "auto" is enabled.
    pub(crate) fn try_new_auto<D>(provider: &D) -> Result<Self, DataError>
    where
        D: DataProvider<SegmenterBreakGraphemeClusterV1>
            + DataProvider<SegmenterLstmAutoV1>
            + DataProvider<SegmenterDictionaryAutoV1>
            + ?Sized,
    {
        let lstm =
            |model: &'static DataMarkerAttributes| -> Result<Option<DictOrLstm>, DataError> {
                let loaded = try_load::<SegmenterLstmAutoV1, D>(provider, model)?;
                Ok(loaded.map(DataPayload::cast).map(Err))
            };

        Ok(Self {
            grapheme: provider.load(Default::default())?.payload,
            my: lstm(MY_LSTM)?,
            km: lstm(KM_LSTM)?,
            lo: lstm(LO_LSTM)?,
            th: lstm(TH_LSTM)?,
            ja: try_load::<SegmenterDictionaryAutoV1, D>(provider, CJ_DICT)?.map(DataPayload::cast),
        })
    }
2✔
248

249
    /// Builds payloads from compiled data, using dictionaries for Burmese,
    /// Khmer, Lao and Thai and no model for Chinese/Japanese.
    #[cfg(feature = "compiled_data")]
    pub(crate) fn new_southeast_asian() -> Self {
        // `try_load` turns a missing model into `None`; it is treated as
        // infallible here because the baked provider is expected to report
        // nothing but a missing identifier.
        #[allow(clippy::unwrap_used)]
        let dict = |model: &'static DataMarkerAttributes| -> Option<DictOrLstm> {
            try_load::<SegmenterDictionaryExtendedV1, _>(&crate::provider::Baked, model)
                .unwrap()
                .map(DataPayload::cast)
                .map(Ok)
        };

        Self {
            grapheme: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_SEGMENTER_BREAK_GRAPHEME_CLUSTER_V1,
            ),
            my: dict(MY_DICT),
            km: dict(KM_DICT),
            lo: dict(LO_DICT),
            th: dict(TH_DICT),
            ja: None,
        }
    }
67✔
276

277
    pub(crate) fn try_new_southeast_asian<D>(provider: &D) -> Result<Self, DataError>
1✔
278
    where
279
        D: DataProvider<SegmenterDictionaryExtendedV1>
280
            + DataProvider<SegmenterBreakGraphemeClusterV1>
281
            + ?Sized,
282
    {
283
        Ok(Self {
1✔
284
            grapheme: provider.load(Default::default())?.payload,
1✔
285
            my: try_load::<SegmenterDictionaryExtendedV1, _>(provider, MY_DICT)?
1✔
286
                .map(DataPayload::cast)
287
                .map(Ok),
288
            km: try_load::<SegmenterDictionaryExtendedV1, _>(provider, KM_DICT)?
1✔
289
                .map(DataPayload::cast)
290
                .map(Ok),
291
            lo: try_load::<SegmenterDictionaryExtendedV1, _>(provider, LO_DICT)?
1✔
292
                .map(DataPayload::cast)
293
                .map(Ok),
294
            th: try_load::<SegmenterDictionaryExtendedV1, _>(provider, TH_DICT)?
1✔
295
                .map(DataPayload::cast)
296
                .map(Ok),
297
            ja: None,
1✔
298
        })
×
299
    }
1✔
300
}
301

302
fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
547✔
303
    provider: &P,
304
    model: &'static DataMarkerAttributes,
305
) -> Result<Option<DataPayload<M>>, DataError> {
306
    provider
547✔
307
        .load(DataRequest {
547✔
308
            id: DataIdentifierBorrowed::for_marker_attributes(model),
547✔
309
            metadata: {
310
                let mut m = DataRequestMetadata::default();
547✔
311
                m.silent = true;
547✔
312
                m.attributes_prefix_match = true;
547✔
313
                m
547✔
314
            },
315
        })
316
        .allow_identifier_not_found()
317
        .map(|r| r.map(|r| r.payload))
1,078✔
318
}
547✔
319

320
/// Return UTF-16 segment offset array using dictionary or lstm segmenter.
321
pub(crate) fn complex_language_segment_utf16(
36✔
322
    payloads: &ComplexPayloads,
323
    input: &[u16],
324
) -> Vec<usize> {
325
    let mut result = Vec::new();
36✔
326
    let mut offset = 0;
36✔
327
    for (slice, lang) in LanguageIteratorUtf16::new(input) {
72✔
328
        match payloads.select(lang) {
36✔
329
            Some(Ok(dict)) => {
12✔
330
                result.extend(
12✔
331
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
12✔
332
                        .segment_utf16(slice)
333
                        .map(|n| offset + n),
41✔
334
                );
335
            }
336
            #[cfg(feature = "lstm")]
337
            Some(Err(lstm)) => {
24✔
338
                result.extend(
24✔
339
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
24✔
340
                        .segment_utf16(slice)
341
                        .map(|n| offset + n),
87✔
342
                );
343
            }
344
            #[cfg(not(feature = "lstm"))]
345
            Some(Err(_infallible)) => {} // should be refutable
346
            None => {
347
                result.push(offset + slice.len());
×
348
            }
349
        }
350
        offset += slice.len();
36✔
351
    }
352
    result
36✔
353
}
36✔
354

355
/// Return UTF-8 segment offset array using dictionary or lstm segmenter.
356
pub(crate) fn complex_language_segment_str(payloads: &ComplexPayloads, input: &str) -> Vec<usize> {
42✔
357
    let mut result = Vec::new();
42✔
358
    let mut offset = 0;
42✔
359
    for (slice, lang) in LanguageIterator::new(input) {
84✔
360
        match payloads.select(lang) {
42✔
361
            Some(Ok(dict)) => {
18✔
362
                result.extend(
18✔
363
                    DictionarySegmenter::new(dict.get(), payloads.grapheme.get())
18✔
364
                        .segment_str(slice)
365
                        .map(|n| offset + n),
54✔
366
                );
367
            }
368
            #[cfg(feature = "lstm")]
369
            Some(Err(lstm)) => {
23✔
370
                result.extend(
23✔
371
                    LstmSegmenter::new(lstm.get(), payloads.grapheme.get())
23✔
372
                        .segment_str(slice)
373
                        .map(|n| offset + n),
84✔
374
                );
375
            }
376
            #[cfg(not(feature = "lstm"))]
377
            Some(Err(_infallible)) => {} // should be refutable
378
            None => {
379
                result.push(offset + slice.len());
1✔
380
            }
381
        }
382
        offset += slice.len();
42✔
383
    }
384
    result
42✔
385
}
42✔
386

387
#[cfg(test)]
#[cfg(feature = "serde")]
mod tests {
    use super::*;

    /// The LSTM and dictionary models must agree on the Thai sample, in both
    /// the UTF-8 and the UTF-16 entry points.
    // NOTE(review): this calls `new_lstm`/`new_dict`, which are gated on the
    // `lstm` and `compiled_data` features, while the module is gated only on
    // `serde` — confirm the feature set used for tests enables all of them.
    #[test]
    fn thai_word_break() {
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
        // Expected break offsets in bytes (UTF-8) and code units (UTF-16).
        const BREAKS_UTF8: [usize; 4] = [12, 21, 33, 42];
        const BREAKS_UTF16: [usize; 4] = [4, 7, 11, 14];

        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();

        let lstm = ComplexPayloads::new_lstm();
        let dict = ComplexPayloads::new_dict();

        for (label, payloads) in [("lstm", &lstm), ("dict", &dict)] {
            assert_eq!(
                complex_language_segment_str(payloads, TEST_STR),
                BREAKS_UTF8,
                "{label}: UTF-8"
            );
            assert_eq!(
                complex_language_segment_utf16(payloads, &utf16),
                BREAKS_UTF16,
                "{label}: UTF-16"
            );
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc