• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #37619

13 Sep 2023 03:28AM UTC coverage: 86.645% (+1.6%) from 85.083%
#37619

push

local

web-flow
elaborate `incremental` argument of `Base.generating_output` (#51281)

Follows up #51216.

74006 of 85413 relevant lines covered (86.64%)

12982266.33 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

45.0
/base/strings/unicode.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
# Various Unicode functionality from the utf8proc library
4
module Unicode
5

6
import Base: show, ==, hash, string, Symbol, isless, length, eltype,
7
             convert, isvalid, ismalformed, isoverlong, iterate
8

9
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
10

11
"""
12
    isvalid(value) -> Bool
13

14
Return `true` if the given value is valid for its type, which currently can be either
15
`AbstractChar` or `String` or `SubString{String}`.
16

17
# Examples
18
```jldoctest
19
julia> isvalid(Char(0xd800))
20
false
21

22
julia> isvalid(SubString(String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80]),1,2))
23
false
24

25
julia> isvalid(Char(0xd799))
26
true
27
```
28
"""
29
isvalid(value)
30

31
"""
32
    isvalid(T, value) -> Bool
33

34
Return `true` if the given value is valid for that type. Types currently can
35
be either `AbstractChar` or `String`. Values for `AbstractChar` can be of type `AbstractChar` or [`UInt32`](@ref).
36
Values for `String` can be of that type, `SubString{String}`, `Vector{UInt8}`,
37
or a contiguous subarray thereof.
38

39
# Examples
40
```jldoctest
41
julia> isvalid(Char, 0xd800)
42
false
43

44
julia> isvalid(String, SubString("thisisvalid",1,5))
45
true
46

47
julia> isvalid(Char, 0xd799)
48
true
49
```
50

51
!!! compat "Julia 1.6"
52
    Support for subarray values was added in Julia 1.6.
53
"""
54
isvalid(T,value)
55

56
# A character is valid when it is well-formed, not an overlong encoding, and a Unicode
# scalar value (0-0xd7ff or 0xe000-0x10ffff).
function isvalid(c::AbstractChar)
    wellformed = !ismalformed(c) & !isoverlong(c)
    scalar = (c ≤ '\ud7ff') | (('\ue000' ≤ c) & (c ≤ '\U10ffff'))
    return wellformed & scalar
end
10,823,648✔
57
# An unsigned code point is valid when it is a Unicode scalar value:
# 0x0000-0xd7ff or 0xe000-0x10ffff (the surrogate range in between is excluded).
function isvalid(::Type{<:AbstractChar}, c::Unsigned)
    below_surrogates = c ≤ 0xd7ff
    above_surrogates = (0xe000 ≤ c) & (c ≤ 0x10ffff)
    return below_surrogates | above_surrogates
end
9✔
58
# Generic integers are converted to `Unsigned` first, then validated as a code point.
function isvalid(::Type{T}, c::Integer) where {T<:AbstractChar}
    codepoint = Unsigned(c)
    return isvalid(T, codepoint)
end
1✔
59
# Validating a character against a character type defers to the one-argument check.
function isvalid(::Type{<:AbstractChar}, c::AbstractChar)
    return isvalid(c)
end
×
60

61
# utf8 category constants
62
# Category codes compared against the result of `category_code`. Each value `k` names the
# entry `category_strings[k + 1]`.

# Other, not assigned
const UTF8PROC_CATEGORY_CN = 0
# Letters
const UTF8PROC_CATEGORY_LU = 1
const UTF8PROC_CATEGORY_LL = 2
const UTF8PROC_CATEGORY_LT = 3
const UTF8PROC_CATEGORY_LM = 4
const UTF8PROC_CATEGORY_LO = 5
# Marks
const UTF8PROC_CATEGORY_MN = 6
const UTF8PROC_CATEGORY_MC = 7
const UTF8PROC_CATEGORY_ME = 8
# Numbers
const UTF8PROC_CATEGORY_ND = 9
const UTF8PROC_CATEGORY_NL = 10
const UTF8PROC_CATEGORY_NO = 11
# Punctuation
const UTF8PROC_CATEGORY_PC = 12
const UTF8PROC_CATEGORY_PD = 13
const UTF8PROC_CATEGORY_PS = 14
const UTF8PROC_CATEGORY_PE = 15
const UTF8PROC_CATEGORY_PI = 16
const UTF8PROC_CATEGORY_PF = 17
const UTF8PROC_CATEGORY_PO = 18
# Symbols
const UTF8PROC_CATEGORY_SM = 19
const UTF8PROC_CATEGORY_SC = 20
const UTF8PROC_CATEGORY_SK = 21
const UTF8PROC_CATEGORY_SO = 22
# Separators
const UTF8PROC_CATEGORY_ZS = 23
const UTF8PROC_CATEGORY_ZL = 24
const UTF8PROC_CATEGORY_ZP = 25
# Other: control, format, surrogate, private use
const UTF8PROC_CATEGORY_CC = 26
const UTF8PROC_CATEGORY_CF = 27
const UTF8PROC_CATEGORY_CS = 28
const UTF8PROC_CATEGORY_CO = 29
92

93
# strings corresponding to the category constants
94
# Human-readable category names, indexed by `category_code(c) + 1`.
# The last two entries belong to the synthetic codes 30 (code point above 0x10ffff)
# and 31 (malformed character) produced by `category_code`.
const category_strings = [
    "Other, not assigned",
    "Letter, uppercase",
    "Letter, lowercase",
    "Letter, titlecase",
    "Letter, modifier",
    "Letter, other",
    "Mark, nonspacing",
    "Mark, spacing combining",
    "Mark, enclosing",
    "Number, decimal digit",
    "Number, letter",
    "Number, other",
    "Punctuation, connector",
    "Punctuation, dash",
    "Punctuation, open",
    "Punctuation, close",
    "Punctuation, initial quote",
    "Punctuation, final quote",
    "Punctuation, other",
    "Symbol, math",
    "Symbol, currency",
    "Symbol, modifier",
    "Symbol, other",
    "Separator, space",
    "Separator, line",
    "Separator, paragraph",
    "Other, control",
    "Other, format",
    "Other, surrogate",
    "Other, private use",
    "Invalid, too high",
    "Malformed, bad data",
]
128

129
# Option bits passed to utf8proc; `normalize` ORs them together into a single bitmask.
const UTF8PROC_STABLE    = (1<<1)
const UTF8PROC_COMPAT    = (1<<2)
const UTF8PROC_COMPOSE   = (1<<3)
const UTF8PROC_DECOMPOSE = (1<<4)
const UTF8PROC_IGNORE    = (1<<5)
const UTF8PROC_REJECTNA  = (1<<6)
const UTF8PROC_NLF2LS    = (1<<7)
const UTF8PROC_NLF2PS    = (1<<8)
# NLF2LF is encoded as both newline bits set at once.
const UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
const UTF8PROC_STRIPCC   = (1<<9)
const UTF8PROC_CASEFOLD  = (1<<10)
const UTF8PROC_CHARBOUND = (1<<11)
const UTF8PROC_LUMP      = (1<<12)
const UTF8PROC_STRIPMARK = (1<<13)
143

144
############################################################################
145

146
# Raise an error whose message is utf8proc's description of the status code `result`.
function utf8proc_error(result)
    msg = ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)
    return error(unsafe_string(msg))
end
×
147

148
# static wrapper around user callback function
149
# static wrapper around user callback function: applies `callback` to one code point and
# converts whatever it returns back to `UInt32`
utf8proc_custom_func(codepoint::UInt32, callback::Any) =
    UInt32(callback(codepoint))::UInt32
151

152
# Call utf8proc's plain decomposition routine (used when `chartransform` is `identity`).
# Returns utf8proc's non-negative result; a negative status is raised through
# `utf8proc_error`. `utf8proc_map` uses the result as a count to size its buffer.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
    ret = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                str, sizeof(str), buffer, nwords, options)
    ret < 0 && utf8proc_error(ret)
    return ret
end
158
# Same as the `identity` method, but routes every code point through `chartransform`:
# utf8proc receives a C function pointer to `utf8proc_custom_func` and `chartransform`
# itself as the callback argument. A negative status is raised through `utf8proc_error`.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T
    ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}),
                str, sizeof(str), buffer, nwords, options,
                @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T})), chartransform)
    ret < 0 && utf8proc_error(ret)
    return ret
end
165

166
# Apply the utf8proc transformation selected by `options` (and `chartransform`) to `str`
# and return the result as a new `String`.
function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
    # First pass with a null buffer only asks how many words the result needs.
    nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
    # The buffer reserves 4 bytes per word.
    buffer = Base.StringVector(nwords*4)
    nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform)
    # Re-encode in place; the call reports the final byte length.
    nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
    nbytes < 0 && utf8proc_error(nbytes)
    return String(resize!(buffer, nbytes))
end
174

175
# from julia_charmap.h, used by julia_chartransform in the Unicode stdlib
176
# Code-point substitutions (source => replacement).
const _julia_charmap = Dict{UInt32,UInt32}(
    0x025B => 0x03B5, # latin small letter open e => greek small letter epsilon
    0x00B5 => 0x03BC, # micro sign => greek small letter mu
    0x00B7 => 0x22C5, # middle dot => dot operator
    0x0387 => 0x22C5, # greek ano teleia => dot operator
    0x2212 => 0x002D, # minus sign => hyphen-minus
)
183

184
# Fallback for arbitrary string types: convert to `String` and call `utf8proc_map` again.
function utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity)
    str = String(s)
    return utf8proc_map(str, flags, chartransform)
end
×
185

186
# Documented in Unicode module
187
# Documented in Unicode module.
# Translate the keyword switches into a utf8proc option bitmask and apply it to `s`.
# Throws `ArgumentError` when `compat` or `stripmark` is requested with neither `compose`
# nor `decompose`, or when more than one newline conversion is selected.
function normalize(
    s::AbstractString;
    stable::Bool=false,
    compat::Bool=false,
    compose::Bool=true,
    decompose::Bool=false,
    stripignore::Bool=false,
    rejectna::Bool=false,
    newline2ls::Bool=false,
    newline2ps::Bool=false,
    newline2lf::Bool=false,
    stripcc::Bool=false,
    casefold::Bool=false,
    lump::Bool=false,
    stripmark::Bool=false,
    chartransform=identity,
)
    flags = 0
    stable && (flags |= UTF8PROC_STABLE)
    compat && (flags |= UTF8PROC_COMPAT)
    # TODO: error if compose & decompose?
    # `decompose` wins when both `decompose` and `compose` are true.
    if decompose
        flags |= UTF8PROC_DECOMPOSE
    elseif compose
        flags |= UTF8PROC_COMPOSE
    elseif compat || stripmark
        throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
    end
    stripignore && (flags |= UTF8PROC_IGNORE)
    rejectna && (flags |= UTF8PROC_REJECTNA)
    # The three newline conversions are mutually exclusive.
    newline2ls + newline2ps + newline2lf > 1 && throw(ArgumentError("only one newline conversion may be specified"))
    newline2ls && (flags |= UTF8PROC_NLF2LS)
    newline2ps && (flags |= UTF8PROC_NLF2PS)
    newline2lf && (flags |= UTF8PROC_NLF2LF)
    stripcc && (flags |= UTF8PROC_STRIPCC)
    casefold && (flags |= UTF8PROC_CASEFOLD)
    lump && (flags |= UTF8PROC_LUMP)
    stripmark && (flags |= UTF8PROC_STRIPMARK)
    return utf8proc_map(s, flags, chartransform)
end
227

228
# Normalize `s` to one of the named forms `:NFC`, `:NFD`, `:NFKC` or `:NFKD`.
# Any other symbol throws an `ArgumentError`.
function normalize(s::AbstractString, nf::Symbol)
    flags = if nf === :NFC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf === :NFD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf === :NFKC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf === :NFKD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, flags)
end
237

238
############################################################################
239

240
## character column width function ##
241
"""
242
    textwidth(c)
243

244
Give the number of columns needed to print a character.
245

246
# Examples
247
```jldoctest
248
julia> textwidth('α')
249
1
250

251
julia> textwidth('⛵')
252
2
253
```
254
"""
255
function textwidth(c::AbstractChar)
    # Malformed characters are reported as one column without consulting utf8proc.
    ismalformed(c) && return 1
    return Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
end
259

260
"""
261
    textwidth(s::AbstractString)
262

263
Give the number of columns needed to print a string.
264

265
# Examples
266
```jldoctest
267
julia> textwidth("March")
268
5
269
```
270
"""
271
# The width of a string is the sum of its characters' widths (0 for an empty string).
function textwidth(s::AbstractString)
    return mapreduce(textwidth, +, s; init=0)
end
479,138✔
272

273
"""
274
    lowercase(c::AbstractChar)
275

276
Convert `c` to lowercase.
277

278
See also [`uppercase`](@ref), [`titlecase`](@ref).
279

280
# Examples
281
```jldoctest
282
julia> lowercase('A')
283
'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)
284

285
julia> lowercase('Ö')
286
'ö': Unicode U+00F6 (category Ll: Letter, lowercase)
287
```
288
"""
289
function lowercase(c::T) where {T<:AbstractChar}
    if isascii(c)
        # ASCII fast path: 'A'-'Z' become 'a'-'z' by adding 0x20; anything else is kept.
        return 'A' <= c <= 'Z' ? c + 0x20 : c
    end
    # Non-ASCII: ask utf8proc and convert the resulting code point back to `T`.
    return T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
end
291

292
"""
293
    uppercase(c::AbstractChar)
294

295
Convert `c` to uppercase.
296

297
See also [`lowercase`](@ref), [`titlecase`](@ref).
298

299
# Examples
300
```jldoctest
301
julia> uppercase('a')
302
'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
303

304
julia> uppercase('ê')
305
'Ê': Unicode U+00CA (category Lu: Letter, uppercase)
306
```
307
"""
308
function uppercase(c::T) where {T<:AbstractChar}
    if isascii(c)
        # ASCII fast path: 'a'-'z' become 'A'-'Z' by subtracting 0x20; anything else is kept.
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    # Non-ASCII: ask utf8proc and convert the resulting code point back to `T`.
    return T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
end
310

311
"""
312
    titlecase(c::AbstractChar)
313

314
Convert `c` to titlecase. This may differ from uppercase for digraphs,
315
compare the example below.
316

317
See also [`uppercase`](@ref), [`lowercase`](@ref).
318

319
# Examples
320
```jldoctest
321
julia> titlecase('a')
322
'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
323

324
julia> titlecase('dž')
325
'Dž': Unicode U+01C5 (category Lt: Letter, titlecase)
326

327
julia> uppercase('dž')
328
'DŽ': Unicode U+01C4 (category Lu: Letter, uppercase)
329
```
330
"""
331
function titlecase(c::T) where {T<:AbstractChar}
    if isascii(c)
        # ASCII fast path: 'a'-'z' become 'A'-'Z' by subtracting 0x20; anything else is kept.
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    # Non-ASCII: ask utf8proc and convert the resulting code point back to `T`.
    return T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
end
333

334
############################################################################
335

336
# returns the category code of `c`: a UTF8PROC_CATEGORY code in 0:29, or one of the two
# extra codes used in this file: 30 (code point above 0x10ffff) and 31 (malformed)
function category_code(c::AbstractChar)
    return !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
end
340

341
function category_code(x::Integer)
    # Anything above the last Unicode code point gets the synthetic code 30
    # ("Invalid, too high" in `category_strings`) instead of being passed to utf8proc.
    return x ≤ 0x10ffff ? ccall(:utf8proc_category, Cint, (UInt32,), x) : Cint(30)
end
344

345
# more human-readable representations of the category code
346
function category_abbrev(c::AbstractChar)
    # "Ma" and "In" are this file's own abbreviations for malformed and too-high characters;
    # every other character gets the abbreviation string supplied by utf8proc.
    ismalformed(c) && return "Ma"
    c ≤ '\U10ffff' || return "In"
    return unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
end
351

352
# Look up the descriptive name for `c`; category codes start at 0, hence the `+ 1`.
function category_string(c)
    idx = category_code(c) + 1
    return category_strings[idx]
end
15✔
353

354
# `c` is assigned when its category code is above CN (not assigned) and no higher than CO.
function isassigned(c)
    code = category_code(c)
    return UTF8PROC_CATEGORY_CN < code <= UTF8PROC_CATEGORY_CO
end
×
355

356
## libc character class predicates ##
357

358
"""
359
    islowercase(c::AbstractChar) -> Bool
360

361
Tests whether a character is a lowercase letter (according to the Unicode
362
standard's `Lowercase` derived property).
363

364
See also [`isuppercase`](@ref).
365

366
# Examples
367
```jldoctest
368
julia> islowercase('α')
369
true
370

371
julia> islowercase('Γ')
372
false
373

374
julia> islowercase('❤')
375
false
376
```
377
"""
378
# Malformed characters are never lowercase; otherwise utf8proc decides.
function islowercase(c::AbstractChar)
    ismalformed(c) && return false
    flag = ccall(:utf8proc_islower, Cint, (UInt32,), UInt32(c))
    return Bool(flag)
end
162✔
379

380
# true for Unicode upper and mixed case
381

382
"""
383
    isuppercase(c::AbstractChar) -> Bool
384

385
Tests whether a character is an uppercase letter (according to the Unicode
386
standard's `Uppercase` derived property).
387

388
See also [`islowercase`](@ref).
389

390
# Examples
391
```jldoctest
392
julia> isuppercase('γ')
393
false
394

395
julia> isuppercase('Γ')
396
true
397

398
julia> isuppercase('❤')
399
false
400
```
401
"""
402
# Malformed characters are never uppercase; otherwise utf8proc decides.
function isuppercase(c::AbstractChar)
    ismalformed(c) && return false
    flag = ccall(:utf8proc_isupper, Cint, (UInt32,), UInt32(c))
    return Bool(flag)
end
26✔
403

404
"""
405
    iscased(c::AbstractChar) -> Bool
406

407
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
408

409
See also [`islowercase`](@ref), [`isuppercase`](@ref).
410
"""
411
function iscased(c::AbstractChar)
    # Cased means the category is one of Lu (uppercase), Lt (titlecase) or Ll (lowercase).
    cat = category_code(c)
    return cat == UTF8PROC_CATEGORY_LU ||
           cat == UTF8PROC_CATEGORY_LT ||
           cat == UTF8PROC_CATEGORY_LL
end
417

418

419
"""
420
    isdigit(c::AbstractChar) -> Bool
421

422
Tests whether a character is a decimal digit (0-9).
423

424
See also: [`isletter`](@ref).
425

426
# Examples
427
```jldoctest
428
julia> isdigit('❤')
429
false
430

431
julia> isdigit('9')
432
true
433

434
julia> isdigit('α')
435
false
436
```
437
"""
438
# Only the ASCII digits '0' through '9' count; both bounds are always evaluated (`&`).
function isdigit(c::AbstractChar)
    return ('0' <= c) & (c <= '9')
end
991,840✔
439

440
"""
441
    isletter(c::AbstractChar) -> Bool
442

443
Test whether a character is a letter.
444
A character is classified as a letter if it belongs to the Unicode general
445
category Letter, i.e. a character whose category code begins with 'L'.
446

447
See also: [`isdigit`](@ref).
448

449
# Examples
450
```jldoctest
451
julia> isletter('❤')
452
false
453

454
julia> isletter('α')
455
true
456

457
julia> isletter('9')
458
false
459
```
460
"""
461
# The letter categories occupy the contiguous code range LU..LO.
function isletter(c::AbstractChar)
    code = category_code(c)
    return UTF8PROC_CATEGORY_LU <= code <= UTF8PROC_CATEGORY_LO
end
361✔
462

463
"""
464
    isnumeric(c::AbstractChar) -> Bool
465

466
Tests whether a character is numeric.
467
A character is classified as numeric if it belongs to the Unicode general category Number,
468
i.e. a character whose category code begins with 'N'.
469

470
Note that this broad category includes characters such as ¾ and ௰.
471
Use [`isdigit`](@ref) to check whether a character is a decimal digit between 0 and 9.
472

473
# Examples
474
```jldoctest
475
julia> isnumeric('௰')
476
true
477

478
julia> isnumeric('9')
479
true
480

481
julia> isnumeric('α')
482
false
483

484
julia> isnumeric('❤')
485
false
486
```
487
"""
488
# The number categories occupy the contiguous code range ND..NO.
function isnumeric(c::AbstractChar)
    code = category_code(c)
    return UTF8PROC_CATEGORY_ND <= code <= UTF8PROC_CATEGORY_NO
end
1,609✔
489

490
# Following C++, only control characters from the Latin-1 subset return true.
491

492
"""
493
    iscntrl(c::AbstractChar) -> Bool
494

495
Tests whether a character is a control character.
496
Control characters are the non-printing characters of the Latin-1 subset of Unicode.
497

498
# Examples
499
```jldoctest
500
julia> iscntrl('\\x01')
501
true
502

503
julia> iscntrl('a')
504
false
505
```
506
"""
507
# Control characters are U+0000-U+001F and U+007F-U+009F.
function iscntrl(c::AbstractChar)
    c <= '\x1f' && return true
    return '\x7f' <= c <= '\u9f'
end
5,458✔
508

509
"""
510
    ispunct(c::AbstractChar) -> Bool
511

512
Tests whether a character belongs to the Unicode general category Punctuation, i.e. a
513
character whose category code begins with 'P'.
514

515
# Examples
516
```jldoctest
517
julia> ispunct('α')
518
false
519

520
julia> ispunct('/')
521
true
522

523
julia> ispunct(';')
524
true
525
```
526
"""
527
# The punctuation categories occupy the contiguous code range PC..PO.
function ispunct(c::AbstractChar)
    code = category_code(c)
    return UTF8PROC_CATEGORY_PC <= code <= UTF8PROC_CATEGORY_PO
end
1,311✔
528

529
# \u85 is the Unicode Next Line (NEL) character
530

531
"""
532
    isspace(c::AbstractChar) -> Bool
533

534
Tests whether a character is any whitespace character. Includes ASCII characters '\\t',
535
'\\n', '\\v', '\\f', '\\r', and ' ', Latin-1 character U+0085, and characters in Unicode
536
category Zs.
537

538
# Examples
539
```jldoctest
540
julia> isspace('\\n')
541
true
542

543
julia> isspace('\\r')
544
true
545

546
julia> isspace(' ')
547
true
548

549
julia> isspace('\\x20')
550
true
551
```
552
"""
553
# Cheap comparisons come first (space, '\t'..'\r', U+0085); the category lookup is only
# reached for characters at or above U+00A0.
@inline isspace(c::AbstractChar) =
    c == ' ' || '\t' <= c <= '\r' || c == '\u85' ||
    '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
556

557
"""
558
    isprint(c::AbstractChar) -> Bool
559

560
Tests whether a character is printable, including spaces, but not a control character.
561

562
# Examples
563
```jldoctest
564
julia> isprint('\\x01')
565
false
566

567
julia> isprint('A')
568
true
569
```
570
"""
571
# Printable means the category code lies in LU..ZS.
function isprint(c::AbstractChar)
    code = category_code(c)
    return UTF8PROC_CATEGORY_LU <= code <= UTF8PROC_CATEGORY_ZS
end
6,088,778✔
572

573
# true in principle if a printer would use ink
574

575
"""
576
    isxdigit(c::AbstractChar) -> Bool
577

578
Test whether a character is a valid hexadecimal digit. Note that this does not
579
include `x` (as in the standard `0x` prefix).
580

581
# Examples
582
```jldoctest
583
julia> isxdigit('a')
584
true
585

586
julia> isxdigit('x')
587
false
588
```
589
"""
590
# Hexadecimal digits are '0'-'9', 'a'-'f' and 'A'-'F'.
function isxdigit(c::AbstractChar)
    '0' <= c <= '9' && return true
    'a' <= c <= 'f' && return true
    return 'A' <= c <= 'F'
end
1,222✔
591

592
## uppercase, lowercase, and titlecase transformations ##
593

594
"""
595
    uppercase(s::AbstractString)
596

597
Return `s` with all characters converted to uppercase.
598

599
See also [`lowercase`](@ref), [`titlecase`](@ref), [`uppercasefirst`](@ref).
600

601
# Examples
602
```jldoctest
603
julia> uppercase("Julia")
604
"JULIA"
605
```
606
"""
607
# Uppercase a string by mapping the character-level `uppercase` over it.
function uppercase(s::AbstractString)
    return map(uppercase, s)
end
724✔
608

609
"""
610
    lowercase(s::AbstractString)
611

612
Return `s` with all characters converted to lowercase.
613

614
See also [`uppercase`](@ref), [`titlecase`](@ref), [`lowercasefirst`](@ref).
615

616
# Examples
617
```jldoctest
618
julia> lowercase("STRINGS AND THINGS")
619
"strings and things"
620
```
621
"""
622
# Lowercase a string by mapping the character-level `lowercase` over it.
function lowercase(s::AbstractString)
    return map(lowercase, s)
end
18,533✔
623

624
"""
625
    titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
626

627
Capitalize the first character of each word in `s`;
628
if `strict` is true, every other character is
629
converted to lowercase, otherwise they are left unchanged.
630
By default, all non-letters beginning a new grapheme are considered as word separators;
631
a predicate can be passed as the `wordsep` keyword to determine
632
which characters should be considered as word separators.
633
See also [`uppercasefirst`](@ref) to capitalize only the first
634
character in `s`.
635

636
See also [`uppercase`](@ref), [`lowercase`](@ref), [`uppercasefirst`](@ref).
637

638
# Examples
639
```jldoctest
640
julia> titlecase("the JULIA programming language")
641
"The Julia Programming Language"
642

643
julia> titlecase("ISS - international space station", strict=false)
644
"ISS - International Space Station"
645

646
julia> titlecase("a-a b-b", wordsep = c->c==' ')
647
"A-a B-b"
648
```
649
"""
650
function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Bool=true)
    startword = true
    state = Ref{Int32}(0)       # grapheme-break state carried from one character to the next
    c0 = eltype(s)(0x00000000)  # previous character; starts as the zero code point
    b = IOBuffer()
    for c in s
        # Note: It would be better to have a word iterator following UAX#29,
        # similar to our grapheme iterator, but utf8proc does not yet have
        # this information.  At the very least we shouldn't break inside graphemes.
        if isgraphemebreak!(state, c0, c) && wordsep(c)
            # Separators are copied through unchanged and arm the next word start.
            print(b, c)
            startword = true
        else
            print(b, startword ? titlecase(c) : strict ? lowercase(c) : c)
            startword = false
        end
        c0 = c
    end
    return String(take!(b))
end
670

671
"""
672
    uppercasefirst(s::AbstractString) -> String
673

674
Return `s` with the first character converted to uppercase (technically "title
675
case" for Unicode). See also [`titlecase`](@ref) to capitalize the first
676
character of every word in `s`.
677

678
See also [`lowercasefirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
679
[`titlecase`](@ref).
680

681
# Examples
682
```jldoctest
683
julia> uppercasefirst("python")
684
"Python"
685
```
686
"""
687
function uppercasefirst(s::AbstractString)
    isempty(s) && return ""
    c = s[1]
    c′ = titlecase(c)
    # Nothing to change: return the input converted to `String`; otherwise splice the
    # title-cased character in front of the remainder.
    return c == c′ ? convert(String, s) :
                     string(c′, SubString(s, nextind(s, 1)))
end
694

695
"""
696
    lowercasefirst(s::AbstractString)
697

698
Return `s` with the first character converted to lowercase.
699

700
See also [`uppercasefirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
701
[`titlecase`](@ref).
702

703
# Examples
704
```jldoctest
705
julia> lowercasefirst("Julia")
706
"julia"
707
```
708
"""
709
function lowercasefirst(s::AbstractString)
    isempty(s) && return ""
    c = s[1]
    c′ = lowercase(c)
    # Nothing to change: return the input converted to `String`; otherwise splice the
    # lowercased character in front of the remainder.
    return c == c′ ? convert(String, s) :
                     string(c′, SubString(s, nextind(s, 1)))
end
716

717
############################################################################
718
# iterators for grapheme segmentation
719

720
# Stateless grapheme-break test between two adjacent characters. A malformed character
# on either side always counts as a break; otherwise utf8proc decides.
isgraphemebreak(c1::AbstractChar, c2::AbstractChar) =
    ismalformed(c1) || ismalformed(c2) ||
    ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
723

724
# Stateful grapheme break required by Unicode-9 rules: the string
725
# must be processed in sequence, with state initialized to Ref{Int32}(0).
726
# Requires utf8proc v2.0 or later.
727
function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
    # A malformed character on either side is always a break and resets the state.
    if ismalformed(c1) || ismalformed(c2)
        state[] = 0
        return true
    end
    # utf8proc reads and updates `state` through the reference.
    return ccall(:utf8proc_grapheme_break_stateful, Bool,
                 (UInt32, UInt32, Ref{Int32}), c1, c2, state)
end
735

736
# Iterator over the graphemes of a string; it only wraps the string itself.
struct GraphemeIterator{S<:AbstractString}
    s::S # original string (for generation of SubStrings)
end
739

740
# Documented in Unicode module
741
# Wrap `s` in a `GraphemeIterator` parameterized by the concrete type of `s`.
function graphemes(s::AbstractString)
    S = typeof(s)
    return GraphemeIterator{S}(s)
end
×
742

743
# Graphemes are produced as substrings of the wrapped string type.
function eltype(::Type{GraphemeIterator{S}}) where {S}
    return SubString{S}
end
×
744
# When the wrapped string is already a `SubString{S}`, the element type stays
# `SubString{S}` rather than nesting another `SubString` layer.
function eltype(::Type{GraphemeIterator{SubString{S}}}) where {S}
    return SubString{S}
end
×
745

746
# Count graphemes by walking the string once and counting grapheme breaks; each `true`
# from `isgraphemebreak!` adds one to the total.
function length(g::GraphemeIterator{S}) where {S}
    c0 = eltype(S)(0x00000000)  # previous character; starts as the zero code point
    n = 0
    state = Ref{Int32}(0)
    for c in g.s
        n += isgraphemebreak!(state, c0, c)
        c0 = c
    end
    return n
end
756

757
# Iteration state is `(grapheme-break state, index where the next grapheme starts)`.
# Each step returns the grapheme as a `SubString` of the wrapped string.
function iterate(g::GraphemeIterator, i_=(Int32(0),firstindex(g.s)))
    s = g.s
    statei, i = i_
    state = Ref{Int32}(statei)
    j = i
    y = iterate(s, i)
    y === nothing && return nothing
    c0, k = y
    while k <= ncodeunits(s) # loop until next grapheme is s[i:j]
        c, ℓ = iterate(s, k)::NTuple{2,Any}
        isgraphemebreak!(state, c0, c) && break
        j = k
        k = ℓ
        c0 = c
    end
    return (SubString(s, i, j), (state[], k))
end
774

775
# Two grapheme iterators are equal when their wrapped strings are equal.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end
×
776
# Hash through the wrapped string so that equal iterators hash equally.
function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end
×
777
# Ordering of grapheme iterators follows the ordering of their wrapped strings.
function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end
×
778

779
# One-line summary: grapheme count, iterator type, and the wrapped string in quotes.
function show(io::IO, g::GraphemeIterator{S}) where {S}
    return print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
end
×
780

781
############################################################################
782

783
end # module
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc