• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

zbraniecki / icu4x / 6815798908

09 Nov 2023 05:17PM UTC coverage: 72.607% (-2.4%) from 75.01%
6815798908

push

github

web-flow
Implement `Any/BufferProvider` for some smart pointers (#4255)

Allows storing them as a `Box<dyn Any/BufferProvider>` without using a
wrapper type that implements the trait.

44281 of 60987 relevant lines covered (72.61%)

201375.86 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

60.78
/components/segmenter/src/sentence.rs
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4

5
use alloc::vec::Vec;
6
use icu_provider::prelude::*;
7

8
use crate::indices::{Latin1Indices, Utf16Indices};
9
use crate::iterator_helpers::derive_usize_iterator_with_type;
10
use crate::rule_segmenter::*;
11
use crate::{provider::*, SegmenterError};
12
use utf8_iter::Utf8CharIndices;
13

14
/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
15
///
16
/// Lifetimes:
17
///
18
/// - `'l` = lifetime of the segmenter object from which this iterator was created
19
/// - `'s` = lifetime of the string being segmented
20
///
21
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
22
/// _after_ the boundary (for a boundary at the end of text, this index is the length
23
/// of the [`str`] or array of code units).
24
///
25
/// For examples of use, see [`SentenceSegmenter`].
26
#[derive(Debug)]
27
pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>(
28
    RuleBreakIterator<'l, 's, Y>,
29
);
30

31
derive_usize_iterator_with_type!(SentenceBreakIterator);
32

33
/// Sentence break iterator for an `str` (a UTF-8 string).
34
///
35
/// For examples of use, see [`SentenceSegmenter`].
36
pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>;
37

38
/// Sentence break iterator for a potentially invalid UTF-8 string.
39
///
40
/// For examples of use, see [`SentenceSegmenter`].
41
pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
42
    SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>;
43

44
/// Sentence break iterator for a Latin-1 (8-bit) string.
45
///
46
/// For examples of use, see [`SentenceSegmenter`].
47
pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>;
48

49
/// Sentence break iterator for a UTF-16 string.
50
///
51
/// For examples of use, see [`SentenceSegmenter`].
52
pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>;
53

54
/// Supports loading sentence break data, and creating sentence break iterators for different string
55
/// encodings.
56
///
57
/// # Examples
58
///
59
/// Segment a string:
60
///
61
/// ```rust
62
/// use icu_segmenter::SentenceSegmenter;
63
/// let segmenter = SentenceSegmenter::new();
64
///
65
/// let breakpoints: Vec<usize> =
66
///     segmenter.segment_str("Hello World").collect();
67
/// assert_eq!(&breakpoints, &[0, 11]);
68
/// ```
69
///
70
/// Segment a Latin-1 byte string:
71
///
72
/// ```rust
73
/// use icu_segmenter::SentenceSegmenter;
74
/// let segmenter = SentenceSegmenter::new();
75
///
76
/// let breakpoints: Vec<usize> =
77
///     segmenter.segment_latin1(b"Hello World").collect();
78
/// assert_eq!(&breakpoints, &[0, 11]);
79
/// ```
80
///
81
/// Successive boundaries can be used to retrieve the sentences.
82
/// In particular, the first boundary is always 0, and the last one is the
83
/// length of the segmented text in code units.
84
///
85
/// ```rust
86
/// # use icu_segmenter::SentenceSegmenter;
87
/// # let segmenter = SentenceSegmenter::new();
88
/// use itertools::Itertools;
89
/// let text = "Ceci tuera cela. Le livre tuera l’édifice.";
90
/// let sentences: Vec<&str> = segmenter
91
///     .segment_str(text)
92
///     .tuple_windows()
93
///     .map(|(i, j)| &text[i..j])
94
///     .collect();
95
/// assert_eq!(
96
///     &sentences,
97
///     &["Ceci tuera cela. ", "Le livre tuera l’édifice."]
98
/// );
99
/// ```
100
#[derive(Debug)]
×
101
pub struct SentenceSegmenter {
102
    payload: DataPayload<SentenceBreakDataV1Marker>,
×
103
}
104

105
#[cfg(feature = "compiled_data")]
106
impl Default for SentenceSegmenter {
107
    fn default() -> Self {
×
108
        Self::new()
×
109
    }
×
110
}
111

112
impl SentenceSegmenter {
113
    /// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data.
114
    ///
115
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
116
    ///
117
    /// [📚 Help choosing a constructor](icu_provider::constructors)
118
    #[cfg(feature = "compiled_data")]
119
    pub fn new() -> Self {
7✔
120
        Self {
7✔
121
            payload: DataPayload::from_static_ref(
7✔
122
                crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1,
123
            ),
124
        }
125
    }
7✔
126

127
    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError,
128
        #[cfg(skip)]
129
        functions: [
130
            new,
131
            try_new_with_any_provider,
132
            try_new_with_buffer_provider,
133
            try_new_unstable,
134
            Self,
135
        ]
136
    );
137

138
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
139
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
×
140
    where
141
        D: DataProvider<SentenceBreakDataV1Marker> + ?Sized,
142
    {
143
        let payload = provider.load(Default::default())?.take_payload()?;
×
144
        Ok(Self { payload })
×
145
    }
×
146

147
    /// Creates a sentence break iterator for an `str` (a UTF-8 string).
148
    ///
149
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
150
    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
536✔
151
        SentenceBreakIterator(RuleBreakIterator {
536✔
152
            iter: input.char_indices(),
536✔
153
            len: input.len(),
536✔
154
            current_pos_data: None,
536✔
155
            result_cache: Vec::new(),
536✔
156
            data: self.payload.get(),
536✔
157
            complex: None,
158
            boundary_property: 0,
159
        })
×
160
    }
536✔
161
    /// Creates a sentence break iterator for a potentially ill-formed UTF-8 string.
162
    ///
163
    /// Invalid characters are treated as U+FFFD REPLACEMENT CHARACTER.
164
    ///
165
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
166
    pub fn segment_utf8<'l, 's>(
×
167
        &'l self,
168
        input: &'s [u8],
169
    ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
170
        SentenceBreakIterator(RuleBreakIterator {
×
171
            iter: Utf8CharIndices::new(input),
×
172
            len: input.len(),
173
            current_pos_data: None,
×
174
            result_cache: Vec::new(),
×
175
            data: self.payload.get(),
×
176
            complex: None,
177
            boundary_property: 0,
178
        })
×
179
    }
×
180
    /// Creates a sentence break iterator for a Latin-1 (8-bit) string.
181
    ///
182
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
183
    pub fn segment_latin1<'l, 's>(
194✔
184
        &'l self,
185
        input: &'s [u8],
186
    ) -> SentenceBreakIteratorLatin1<'l, 's> {
187
        SentenceBreakIterator(RuleBreakIterator {
194✔
188
            iter: Latin1Indices::new(input),
194✔
189
            len: input.len(),
190
            current_pos_data: None,
194✔
191
            result_cache: Vec::new(),
194✔
192
            data: self.payload.get(),
194✔
193
            complex: None,
194
            boundary_property: 0,
195
        })
×
196
    }
194✔
197

198
    /// Creates a sentence break iterator for a UTF-16 string.
199
    ///
200
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
201
    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
532✔
202
        SentenceBreakIterator(RuleBreakIterator {
532✔
203
            iter: Utf16Indices::new(input),
532✔
204
            len: input.len(),
205
            current_pos_data: None,
532✔
206
            result_cache: Vec::new(),
532✔
207
            data: self.payload.get(),
532✔
208
            complex: None,
209
            boundary_property: 0,
210
        })
×
211
    }
532✔
212
}
213

214
// Checks that segmenting the empty string yields exactly one boundary, at 0.
// Gated on `compiled_data` because `SentenceSegmenter::new` is only compiled
// when that feature is enabled.
#[cfg(all(test, feature = "compiled_data"))]
215
#[test]
216
fn empty_string() {
2✔
217
    let segmenter = SentenceSegmenter::new();
1✔
218
    let breaks: Vec<usize> = segmenter.segment_str("").collect();
1✔
219
    assert_eq!(breaks, [0]);
1✔
220
}
2✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc