• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

JuliaLang / julia / #38182

15 Aug 2025 03:55AM UTC coverage: 77.87% (-0.4%) from 78.28%
#38182

push

local

web-flow
🤖 [master] Bump the SparseArrays stdlib from 30201ab to bb5ecc0 (#59263)

Stdlib: SparseArrays
URL: https://github.com/JuliaSparse/SparseArrays.jl.git
Stdlib branch: main
Julia branch: master
Old commit: 30201ab
New commit: bb5ecc0
Julia version: 1.13.0-DEV
SparseArrays version: 1.13.0
Bump invoked by: @ViralBShah
Powered by:
[BumpStdlibs.jl](https://github.com/JuliaLang/BumpStdlibs.jl)

Diff:
https://github.com/JuliaSparse/SparseArrays.jl/compare/30201abcb...bb5ecc091

```
$ git log --oneline 30201ab..bb5ecc0
bb5ecc0 fast quadratic form for dense matrix, sparse vectors (#640)
34ece87 Extend 3-arg `dot` to generic `HermOrSym` sparse matrices (#643)
095b685 Exclude unintended complex symmetric sparse matrices from 3-arg `dot` (#642)
8049287 Fix signature for 2-arg matrix-matrix `dot` (#641)
cff971d Make cond(::SparseMatrix, 1 / Inf) discoverable from 2-norm error (#629)
```

Co-authored-by: ViralBShah <744411+ViralBShah@users.noreply.github.com>

48274 of 61993 relevant lines covered (77.87%)

9571166.83 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.22
/base/strings/unicode.jl
1
# This file is a part of Julia. License is MIT: https://julialang.org/license
2

3
# Various Unicode functionality from the utf8proc library
4
module Unicode
5

6
import Base: show, ==, hash, string, Symbol, isless, length, eltype,
7
             convert, isvalid, ismalformed, isoverlong, iterate,
8
             AnnotatedString, AnnotatedChar, annotated_chartransform,
9
             @assume_effects, annotations, is_overlong_enc
10

11
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
12

13
"""
14
    isvalid(value)::Bool
15

16
Return `true` if the given value is valid for its type, which currently can be either
17
`AbstractChar` or `String` or `SubString{String}`.
18

19
# Examples
20
```jldoctest
21
julia> isvalid(Char(0xd800))
22
false
23

24
julia> isvalid(SubString(String(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80]),1,2))
25
false
26

27
julia> isvalid(Char(0xd799))
28
true
29
```
30
"""
31
isvalid(value)
32

33
"""
34
    isvalid(T, value)::Bool
35

36
Return `true` if the given value is valid for that type. Types currently can
37
be either `AbstractChar` or `String`. Values for `AbstractChar` can be of type `AbstractChar` or [`UInt32`](@ref).
38
Values for `String` can be of that type, `SubString{String}`, `Vector{UInt8}`,
39
or a contiguous subarray thereof.
40

41
# Examples
42
```jldoctest
43
julia> isvalid(Char, 0xd800)
44
false
45

46
julia> isvalid(String, SubString("thisisvalid",1,5))
47
true
48

49
julia> isvalid(Char, 0xd799)
50
true
51
```
52

53
!!! compat "Julia 1.6"
54
    Support for subarray values was added in Julia 1.6.
55
"""
56
isvalid(T,value)
57

58
# A character is valid when its encoding is neither malformed nor overlong and
# its codepoint lies in 0-0xd7ff or 0xe000-0x10ffff. Note that `&` binds tighter
# than `|`, so the range test reads `low | (mid & high)`.
isvalid(c::AbstractChar) =
    !ismalformed(c) & !isoverlong(c) &
    ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))

# Codepoint supplied as an unsigned integer: only the range is checked.
isvalid(::Type{<:AbstractChar}, c::Unsigned) =
    (c ≤ 0xd7ff) | (0xe000 ≤ c) & (c ≤ 0x10ffff)

# Any other integer. A negative value can never be a codepoint, so answer
# `false` instead of letting `Unsigned(c)` throw an `InexactError`.
isvalid(::Type{T}, c::Integer) where {T<:AbstractChar} =
    c ≥ 0 && isvalid(T, Unsigned(c))

# A character value is checked by the one-argument method.
isvalid(::Type{<:AbstractChar}, c::AbstractChar) = isvalid(c)
2✔
62

63
# utf8 category constants
64
const UTF8PROC_CATEGORY_CN = 0
65
const UTF8PROC_CATEGORY_LU = 1
66
const UTF8PROC_CATEGORY_LL = 2
67
const UTF8PROC_CATEGORY_LT = 3
68
const UTF8PROC_CATEGORY_LM = 4
69
const UTF8PROC_CATEGORY_LO = 5
70
const UTF8PROC_CATEGORY_MN = 6
71
const UTF8PROC_CATEGORY_MC = 7
72
const UTF8PROC_CATEGORY_ME = 8
73
const UTF8PROC_CATEGORY_ND = 9
74
const UTF8PROC_CATEGORY_NL = 10
75
const UTF8PROC_CATEGORY_NO = 11
76
const UTF8PROC_CATEGORY_PC = 12
77
const UTF8PROC_CATEGORY_PD = 13
78
const UTF8PROC_CATEGORY_PS = 14
79
const UTF8PROC_CATEGORY_PE = 15
80
const UTF8PROC_CATEGORY_PI = 16
81
const UTF8PROC_CATEGORY_PF = 17
82
const UTF8PROC_CATEGORY_PO = 18
83
const UTF8PROC_CATEGORY_SM = 19
84
const UTF8PROC_CATEGORY_SC = 20
85
const UTF8PROC_CATEGORY_SK = 21
86
const UTF8PROC_CATEGORY_SO = 22
87
const UTF8PROC_CATEGORY_ZS = 23
88
const UTF8PROC_CATEGORY_ZL = 24
89
const UTF8PROC_CATEGORY_ZP = 25
90
const UTF8PROC_CATEGORY_CC = 26
91
const UTF8PROC_CATEGORY_CF = 27
92
const UTF8PROC_CATEGORY_CS = 28
93
const UTF8PROC_CATEGORY_CO = 29
94

95
# strings corresponding to the category constants
96
const category_strings = [
97
    "Other, not assigned",
98
    "Letter, uppercase",
99
    "Letter, lowercase",
100
    "Letter, titlecase",
101
    "Letter, modifier",
102
    "Letter, other",
103
    "Mark, nonspacing",
104
    "Mark, spacing combining",
105
    "Mark, enclosing",
106
    "Number, decimal digit",
107
    "Number, letter",
108
    "Number, other",
109
    "Punctuation, connector",
110
    "Punctuation, dash",
111
    "Punctuation, open",
112
    "Punctuation, close",
113
    "Punctuation, initial quote",
114
    "Punctuation, final quote",
115
    "Punctuation, other",
116
    "Symbol, math",
117
    "Symbol, currency",
118
    "Symbol, modifier",
119
    "Symbol, other",
120
    "Separator, space",
121
    "Separator, line",
122
    "Separator, paragraph",
123
    "Other, control",
124
    "Other, format",
125
    "Other, surrogate",
126
    "Other, private use",
127
    "Invalid, too high",
128
    "Malformed, bad data",
129
]
130

131
const UTF8PROC_STABLE    = (1<<1)
132
const UTF8PROC_COMPAT    = (1<<2)
133
const UTF8PROC_COMPOSE   = (1<<3)
134
const UTF8PROC_DECOMPOSE = (1<<4)
135
const UTF8PROC_IGNORE    = (1<<5)
136
const UTF8PROC_REJECTNA  = (1<<6)
137
const UTF8PROC_NLF2LS    = (1<<7)
138
const UTF8PROC_NLF2PS    = (1<<8)
139
const UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
140
const UTF8PROC_STRIPCC   = (1<<9)
141
const UTF8PROC_CASEFOLD  = (1<<10)
142
const UTF8PROC_CHARBOUND = (1<<11)
143
const UTF8PROC_LUMP      = (1<<12)
144
const UTF8PROC_STRIPMARK = (1<<13)
145

146
############################################################################
147

148
# Throw an error whose message is the string utf8proc reports for `result`.
function utf8proc_error(result)
    msg = ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)
    return error(unsafe_string(msg))
end
×
149

150
# static wrapper around user callback function
151
function utf8proc_custom_func(codepoint::UInt32, callback::Any)
    # Whatever the callback returns is converted to a `UInt32`.
    mapped = callback(codepoint)
    return UInt32(mapped)::UInt32
end
153

154
# Decomposition without a custom character transform (`chartransform === identity`).
# Returns the non-negative result of `utf8proc_decompose`; a negative result is
# turned into an error via `utf8proc_error`.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
    count = ccall(:utf8proc_decompose, Int,
                  (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                  str, sizeof(str), buffer, nwords, options)
    if count < 0
        utf8proc_error(count)
    end
    return count
end
160
# Decomposition with a user-supplied character transform. The transform is passed
# by reference alongside a C-callable pointer to `utf8proc_custom_func`.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::F) where F
    count = ccall(:utf8proc_decompose_custom, Int,
                  (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{F}),
                  str, sizeof(str), buffer, nwords, options,
                  @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{F})), chartransform)
    if count < 0
        utf8proc_error(count)
    end
    return count
end
167

168
function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform::F = identity) where F
    # First call gets a null buffer of capacity 0; its result sizes the real buffer
    # (four bytes per reported word).
    needed = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
    buffer = Base.StringVector(needed*4)
    written = utf8proc_decompose(str, options, buffer, needed, chartransform)
    # Re-encode in place, then trim the buffer to the reported byte count.
    nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, written, options)
    if nbytes < 0
        utf8proc_error(nbytes)
    end
    return String(resize!(buffer, nbytes))
end
176

177
"""
178
`Dict` of `original codepoint => replacement codepoint` normalizations
179
to perform on Julia identifiers, to canonicalize characters that
180
are both easily confused and easily inputted by accident.
181

182
!!! warning
183
    When this table is updated, also update the corresponding table in `src/flisp/julia_charmap.h`.
184
"""
185
const _julia_charmap = Dict{UInt32,UInt32}(
186
    0x025B => 0x03B5, # latin small letter open e -> greek small letter epsilon
187
    0x00B5 => 0x03BC, # micro sign -> greek small letter mu
188
    0x00B7 => 0x22C5, # middot char -> dot operator (#25098)
189
    0x0387 => 0x22C5, # Greek interpunct -> dot operator (#25098)
190
    0x2212 => 0x002D, # minus -> hyphen-minus (#26193)
191
    0x210F => 0x0127, # hbar -> small letter h with stroke (#48870)
192
)
193

194
# Generic strings are converted to `String` before mapping.
function utf8proc_map(s::AbstractString, flags::Integer, chartransform::F = identity) where F
    return utf8proc_map(String(s), flags, chartransform)
end
2✔
195

196
# Documented in Unicode module
197
function normalize(
    s::AbstractString;
    stable::Bool=false,
    compat::Bool=false,
    compose::Bool=true,
    decompose::Bool=false,
    stripignore::Bool=false,
    rejectna::Bool=false,
    newline2ls::Bool=false,
    newline2ps::Bool=false,
    newline2lf::Bool=false,
    stripcc::Bool=false,
    casefold::Bool=false,
    lump::Bool=false,
    stripmark::Bool=false,
    chartransform=identity,
)
    # Translate the boolean keywords into the utf8proc option bitmask.
    flags = 0
    if stable
        flags |= UTF8PROC_STABLE
    end
    if compat
        flags |= UTF8PROC_COMPAT
    end
    # TODO: error if compose & decompose?
    # As written, `decompose` takes precedence when both are set.
    if decompose
        flags |= UTF8PROC_DECOMPOSE
    elseif compose
        flags |= UTF8PROC_COMPOSE
    elseif compat || stripmark
        throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
    end
    if stripignore
        flags |= UTF8PROC_IGNORE
    end
    if rejectna
        flags |= UTF8PROC_REJECTNA
    end
    # The three newline conversions are mutually exclusive.
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end
    if newline2ls
        flags |= UTF8PROC_NLF2LS
    end
    if newline2ps
        flags |= UTF8PROC_NLF2PS
    end
    if newline2lf
        flags |= UTF8PROC_NLF2LF
    end
    if stripcc
        flags |= UTF8PROC_STRIPCC
    end
    if casefold
        flags |= UTF8PROC_CASEFOLD
    end
    if lump
        flags |= UTF8PROC_LUMP
    end
    if stripmark
        flags |= UTF8PROC_STRIPMARK
    end
    return utf8proc_map(s, flags, chartransform)
end
237

238
# Normalize `s` to one of the four named forms; any other symbol is rejected
# with an `ArgumentError`.
function normalize(s::AbstractString, nf::Symbol)
    flags = if nf === :NFC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf === :NFD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf === :NFKC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf === :NFKD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, flags)
end
247

248
############################################################################
249

250
## character column width function ##
251
"""
252
    textwidth(c)
253

254
Give the number of columns needed to print a character.
255

256
# Examples
257
```jldoctest
258
julia> textwidth('α')
259
1
260

261
julia> textwidth('⛵')
262
2
263
```
264
"""
265
function textwidth(c::AbstractChar)
    # Generic characters are measured through their `Char` conversion.
    return textwidth(Char(c)::Char)
end

function textwidth(c::Char)
    bits = reinterpret(UInt32, c)
    swapped = bswap(bits) # from isascii(c)
    # ASCII fast path: width 0 below 0x20, width 1 from 0x20 up to 0x7e.
    if swapped < 0x7f
        return Int(swapped >= 0x20)
    end
    # We can't know a priori how terminals will render invalid UTF8 chars,
    # so we conservatively decide a width of 1.
    if ismalformed(c) || is_overlong_enc(bits)
        return 1
    end
    return Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
end
276

277
"""
278
    textwidth(s::AbstractString)
279

280
Give the number of columns needed to print a string.
281

282
# Examples
283
```jldoctest
284
julia> textwidth("March")
285
5
286
```
287
"""
288
function textwidth(s::AbstractString)
    # Sum the per-character widths, starting from 0 so an empty string gives 0.
    total = 0
    for c in s
        total += textwidth(c)
    end
    return total
end

# An annotated string is as wide as the plain string it wraps.
function textwidth(s::AnnotatedString)
    return textwidth(s.string)
end
1,107✔
291

292
"""
293
    lowercase(c::AbstractChar)
294

295
Convert `c` to lowercase.
296

297
See also [`uppercase`](@ref), [`titlecase`](@ref).
298

299
# Examples
300
```jldoctest
301
julia> lowercase('A')
302
'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)
303

304
julia> lowercase('Ö')
305
'ö': Unicode U+00F6 (category Ll: Letter, lowercase)
306
```
307
"""
308
function lowercase(c::T) where {T<:AbstractChar}
    # ASCII is handled arithmetically: 'A'..'Z' shift up by 0x20.
    if isascii(c)
        return 'A' <= c <= 'Z' ? c + 0x20 : c
    end
    # Everything else is looked up by utf8proc and rewrapped as `T`.
    return T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
end
310

311
# Lowercase the wrapped character and carry its annotations over unchanged.
function lowercase(c::AnnotatedChar)
    lowered = lowercase(c.char)
    return AnnotatedChar(lowered, annotations(c))
end
1✔
312

313
"""
314
    uppercase(c::AbstractChar)
315

316
Convert `c` to uppercase.
317

318
See also [`lowercase`](@ref), [`titlecase`](@ref).
319

320
# Examples
321
```jldoctest
322
julia> uppercase('a')
323
'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
324

325
julia> uppercase('ê')
326
'Ê': Unicode U+00CA (category Lu: Letter, uppercase)
327
```
328
"""
329
function uppercase(c::T) where {T<:AbstractChar}
    # ASCII is handled arithmetically: 'a'..'z' shift down by 0x20.
    if isascii(c)
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    # Everything else is looked up by utf8proc and rewrapped as `T`.
    return T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
end
331

332
# Uppercase the wrapped character and carry its annotations over unchanged.
function uppercase(c::AnnotatedChar)
    raised = uppercase(c.char)
    return AnnotatedChar(raised, annotations(c))
end
1✔
333

334
"""
335
    titlecase(c::AbstractChar)
336

337
Convert `c` to titlecase. This may differ from uppercase for digraphs,
338
compare the example below.
339

340
See also [`uppercase`](@ref), [`lowercase`](@ref).
341

342
# Examples
343
```jldoctest
344
julia> titlecase('a')
345
'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
346

347
julia> titlecase('dž')
348
'Dž': Unicode U+01C5 (category Lt: Letter, titlecase)
349

350
julia> uppercase('dž')
351
'DŽ': Unicode U+01C4 (category Lu: Letter, uppercase)
352
```
353
"""
354
function titlecase(c::T) where {T<:AbstractChar}
    # For ASCII this is the same arithmetic shift as uppercasing.
    if isascii(c)
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    # Everything else is looked up by utf8proc and rewrapped as `T`.
    return T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
end
356

357
# Titlecase the wrapped character and carry its annotations over unchanged.
function titlecase(c::AnnotatedChar)
    titled = titlecase(c.char)
    return AnnotatedChar(titled, annotations(c))
end
1✔
358

359
############################################################################
360

361
# returns a UTF8PROC_CATEGORY code in 0:29 giving the Unicode category,
# or 30 if the codepoint is too high and 31 if the character is malformed
362
function category_code(c::AbstractChar)
    # Malformed characters get the dedicated code 31.
    ismalformed(c) && return Cint(31)
    return category_code(UInt32(c))
end

function category_code(x::Integer)
    # Values above 0x10ffff get the dedicated code 30.
    x ≤ 0x10ffff || return Cint(30)
    return (@assume_effects :foldable @ccall utf8proc_category(UInt32(x)::UInt32)::Cint)
end
369

370
# more human-readable representations of the category code
371
function category_abbrev(c::AbstractChar)
    # "Ma" and "In" are returned here without consulting utf8proc.
    if ismalformed(c)
        return "Ma"
    elseif !(c ≤ '\U10ffff')
        return "In"
    end
    return unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
end
376

377
# Description of the category of `c`; `category_strings` is 1-based while
# category codes start at 0, hence the `+ 1`.
function category_string(c)
    code = category_code(c)
    return category_strings[code + 1]
end

# True when the category code of `c` is above CN and at most CO.
function isassigned(c)
    code = category_code(c)
    return UTF8PROC_CATEGORY_CN < code <= UTF8PROC_CATEGORY_CO
end
22✔
380

381
## libc character class predicates ##
382

383
"""
384
    islowercase(c::AbstractChar)::Bool
385

386
Tests whether a character is a lowercase letter (according to the Unicode
387
standard's `Lowercase` derived property).
388

389
See also [`isuppercase`](@ref).
390

391
# Examples
392
```jldoctest
393
julia> islowercase('α')
394
true
395

396
julia> islowercase('Γ')
397
false
398

399
julia> islowercase('❤')
400
false
401
```
402
"""
403
function islowercase(c::AbstractChar)
    # `UInt32(c)` below is only evaluated for well-formed characters.
    ismalformed(c) && return false
    return Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint)
end
405

406
# true for Unicode upper and mixed case
407

408
"""
409
    isuppercase(c::AbstractChar)::Bool
410

411
Tests whether a character is an uppercase letter (according to the Unicode
412
standard's `Uppercase` derived property).
413

414
See also [`islowercase`](@ref).
415

416
# Examples
417
```jldoctest
418
julia> isuppercase('γ')
419
false
420

421
julia> isuppercase('Γ')
422
true
423

424
julia> isuppercase('❤')
425
false
426
```
427
"""
428
function isuppercase(c::AbstractChar)
    # `UInt32(c)` below is only evaluated for well-formed characters.
    ismalformed(c) && return false
    return Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint)
end
430

431
"""
432
    iscased(c::AbstractChar)::Bool
433

434
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
435

436
See also [`islowercase`](@ref), [`isuppercase`](@ref).
437
"""
438
function iscased(c::AbstractChar)
    # Cased means category Lu, Lt or Ll.
    code = category_code(c)
    code == UTF8PROC_CATEGORY_LU && return true
    code == UTF8PROC_CATEGORY_LT && return true
    return code == UTF8PROC_CATEGORY_LL
end
444

445

446
"""
447
    isdigit(c::AbstractChar)::Bool
448

449
Tests whether a character is an ASCII decimal digit (`0`-`9`).
450

451
See also: [`isletter`](@ref).
452

453
# Examples
454
```jldoctest
455
julia> isdigit('❤')
456
false
457

458
julia> isdigit('9')
459
true
460

461
julia> isdigit('α')
462
false
463
```
464
"""
465
function isdigit(c::AbstractChar)
    # Only the ten ASCII digits qualify.
    return '0' <= c <= '9'
end
705,495✔
466

467
"""
468
    isletter(c::AbstractChar)::Bool
469

470
Test whether a character is a letter.
471
A character is classified as a letter if it belongs to the Unicode general
472
category Letter, i.e. a character whose category code begins with 'L'.
473

474
See also: [`isdigit`](@ref).
475

476
# Examples
477
```jldoctest
478
julia> isletter('❤')
479
false
480

481
julia> isletter('α')
482
true
483

484
julia> isletter('9')
485
false
486
```
487
"""
488
function isletter(c::AbstractChar)
    # The letter categories are the contiguous code range LU..LO.
    code = category_code(c)
    return UTF8PROC_CATEGORY_LU <= code <= UTF8PROC_CATEGORY_LO
end
797✔
489

490
"""
491
    isnumeric(c::AbstractChar)::Bool
492

493
Tests whether a character is numeric.
494
A character is classified as numeric if it belongs to the Unicode general category Number,
495
i.e. a character whose category code begins with 'N'.
496

497
Note that this broad category includes characters such as ¾ and ௰.
498
Use [`isdigit`](@ref) to check whether a character is a decimal digit between 0 and 9.
499

500
# Examples
501
```jldoctest
502
julia> isnumeric('௰')
503
true
504

505
julia> isnumeric('9')
506
true
507

508
julia> isnumeric('α')
509
false
510

511
julia> isnumeric('❤')
512
false
513
```
514
"""
515
function isnumeric(c::AbstractChar)
    # The number categories are the contiguous code range ND..NO.
    code = category_code(c)
    return UTF8PROC_CATEGORY_ND <= code <= UTF8PROC_CATEGORY_NO
end
132✔
516

517
# Following C++, only control characters from the Latin-1 subset return true.
518

519
"""
520
    iscntrl(c::AbstractChar)::Bool
521

522
Tests whether a character is a control character.
523
Control characters are the non-printing characters of the Latin-1 subset of Unicode.
524

525
# Examples
526
```jldoctest
527
julia> iscntrl('\\x01')
528
true
529

530
julia> iscntrl('a')
531
false
532
```
533
"""
534
function iscntrl(c::AbstractChar)
    # Two ranges count: up to U+001F, and U+007F through U+009F.
    c <= '\x1f' && return true
    return '\x7f' <= c <= '\u9f'
end
15,745✔
535

536
"""
537
    ispunct(c::AbstractChar)::Bool
538

539
Tests whether a character belongs to the Unicode general category Punctuation, i.e. a
540
character whose category code begins with 'P'.
541

542
!!! note
543
    This behavior is different from the `ispunct` function in C.
544

545
# Examples
546
```jldoctest
547
julia> ispunct('α')
548
false
549

550
julia> ispunct('=')
551
false
552

553
julia> ispunct('/')
554
true
555

556
julia> ispunct(';')
557
true
558
```
559
"""
560
function ispunct(c::AbstractChar)
    # The punctuation categories are the contiguous code range PC..PO.
    code = category_code(c)
    return UTF8PROC_CATEGORY_PC <= code <= UTF8PROC_CATEGORY_PO
end
56✔
561

562
# \u85 is the Unicode Next Line (NEL) character
563

564
"""
565
    isspace(c::AbstractChar)::Bool
566

567
Tests whether a character is any whitespace character. Includes ASCII characters '\\t',
568
'\\n', '\\v', '\\f', '\\r', and ' ', Latin-1 character U+0085, and characters in Unicode
569
category Zs.
570

571
# Examples
572
```jldoctest
573
julia> isspace('\\n')
574
true
575

576
julia> isspace('\\r')
577
true
578

579
julia> isspace(' ')
580
true
581

582
julia> isspace('\\x20')
583
true
584
```
585
"""
586
@inline function isspace(c::AbstractChar)
    # Space, '\t' through '\r', and U+0085 are decided without a category lookup.
    (c == ' ' || '\t' <= c <= '\r' || c == '\u85') && return true
    # From U+00A0 upward, the character must be in category Zs.
    return '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
end
589

590
"""
591
    isprint(c::AbstractChar)::Bool
592

593
Tests whether a character is printable, including spaces, but not a control character.
594

595
# Examples
596
```jldoctest
597
julia> isprint('\\x01')
598
false
599

600
julia> isprint('A')
601
true
602
```
603
"""
604
function isprint(c::AbstractChar)
    # Printable means a category code in the range LU..ZS.
    code = category_code(c)
    return UTF8PROC_CATEGORY_LU <= code <= UTF8PROC_CATEGORY_ZS
end
116,588,416✔
605

606
# true in principle if a printer would use ink
607

608
"""
609
    isxdigit(c::AbstractChar)::Bool
610

611
Test whether a character is a valid hexadecimal digit. Note that this does not
612
include `x` (as in the standard `0x` prefix).
613

614
# Examples
615
```jldoctest
616
julia> isxdigit('a')
617
true
618

619
julia> isxdigit('x')
620
false
621
```
622
"""
623
function isxdigit(c::AbstractChar)
    # Accept ASCII digits and the letters a-f in either case.
    '0' <= c <= '9' && return true
    'a' <= c <= 'f' && return true
    return 'A' <= c <= 'F'
end
4✔
624

625
## uppercase, lowercase, and titlecase transformations ##
626

627
"""
628
    uppercase(s::AbstractString)
629

630
Return `s` with all characters converted to uppercase.
631

632
See also [`lowercase`](@ref), [`titlecase`](@ref), [`uppercasefirst`](@ref).
633

634
# Examples
635
```jldoctest
636
julia> uppercase("Julia")
637
"JULIA"
638
```
639
"""
640
function uppercase(s::AbstractString)
    # Character-by-character mapping.
    return map(uppercase, s)
end

# Annotated strings go through `annotated_chartransform` instead of `map`.
function uppercase(s::AnnotatedString)
    return annotated_chartransform(uppercase, s)
end
2✔
642

643
"""
644
    lowercase(s::AbstractString)
645

646
Return `s` with all characters converted to lowercase.
647

648
See also [`uppercase`](@ref), [`titlecase`](@ref), [`lowercasefirst`](@ref).
649

650
# Examples
651
```jldoctest
652
julia> lowercase("STRINGS AND THINGS")
653
"strings and things"
654
```
655
"""
656
function lowercase(s::AbstractString)
    # Character-by-character mapping.
    return map(lowercase, s)
end

# Annotated strings go through `annotated_chartransform` instead of `map`.
function lowercase(s::AnnotatedString)
    return annotated_chartransform(lowercase, s)
end
2✔
658

659
"""
660
    titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true)::String
661

662
Capitalize the first character of each word in `s`;
663
if `strict` is true, every other character is
664
converted to lowercase, otherwise they are left unchanged.
665
By default, all non-letters beginning a new grapheme are considered as word separators;
666
a predicate can be passed as the `wordsep` keyword to determine
667
which characters should be considered as word separators.
668
See also [`uppercasefirst`](@ref) to capitalize only the first
669
character in `s`.
670

671
See also [`uppercase`](@ref), [`lowercase`](@ref), [`uppercasefirst`](@ref).
672

673
# Examples
674
```jldoctest
675
julia> titlecase("the JULIA programming language")
676
"The Julia Programming Language"
677

678
julia> titlecase("ISS - international space station", strict=false)
679
"ISS - International Space Station"
680

681
julia> titlecase("a-a b-b", wordsep = c->c==' ')
682
"A-a B-b"
683
```
684
"""
685
function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Bool=true)
    out = IOBuffer()
    breakstate = Ref{Int32}(0)
    prev = eltype(s)(0x00000000)
    atwordstart = true
    for c in s
        # Note: It would be better to have a word iterator following UAX#29,
        # similar to our grapheme iterator, but utf8proc does not yet have
        # this information.  At the very least we shouldn't break inside graphemes.
        if isgraphemebreak!(breakstate, prev, c) && wordsep(c)
            # Separators are copied verbatim and arm the next word start.
            print(out, c)
            atwordstart = true
        else
            if atwordstart
                print(out, titlecase(c))
            elseif strict
                print(out, lowercase(c))
            else
                print(out, c)
            end
            atwordstart = false
        end
        prev = c
    end
    return takestring!(out)
end
705

706
# TODO: improve performance characteristics, room for a ~10x improvement.
707
function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true)
    # All loop state travels in a named tuple threaded through the transform.
    seed = (; startword = true, state = Ref{Int32}(0),
            c0 = eltype(s)(zero(UInt32)), wordsep, strict)
    return annotated_chartransform(s, seed) do c, st
        if isgraphemebreak!(st.state, st.c0, c) && st.wordsep(c)
            # Separators pass through unchanged and arm the next word start.
            st = Base.setindex(st, true, :startword)
            cnew = c
        else
            if st.startword
                cnew = titlecase(c)
            elseif st.strict
                cnew = lowercase(c)
            else
                cnew = c
            end
            st = Base.setindex(st, false, :startword)
        end
        st = Base.setindex(st, c, :c0)
        cnew, st
    end
end
722

723
"""
724
    uppercasefirst(s::AbstractString)::String
725

726
Return `s` with the first character converted to uppercase (technically "title
727
case" for Unicode). See also [`titlecase`](@ref) to capitalize the first
728
character of every word in `s`.
729

730
See also [`lowercasefirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
731
[`titlecase`](@ref).
732

733
# Examples
734
```jldoctest
735
julia> uppercasefirst("python")
736
"Python"
737
```
738
"""
739
function uppercasefirst(s::AbstractString)
    isempty(s) && return ""
    head = s[1]
    titled = titlecase(head)
    # Nothing to change: hand back the input converted to `String`.
    if head == titled
        return convert(String, s)
    end
    return string(titled, SubString(s, nextind(s, 1)))
end
746

747
# TODO: improve performance characteristics, room for a ~5x improvement.
748
function uppercasefirst(s::AnnotatedString)
    # The boolean state is `true` only until the first character has been handled.
    return annotated_chartransform(s, true) do c, pending
        pending ? (titlecase(c), false) : (c, pending)
    end
end
757

758
"""
759
    lowercasefirst(s::AbstractString)
760

761
Return `s` with the first character converted to lowercase.
762

763
See also [`uppercasefirst`](@ref), [`uppercase`](@ref), [`lowercase`](@ref),
764
[`titlecase`](@ref).
765

766
# Examples
767
```jldoctest
768
julia> lowercasefirst("Julia")
769
"julia"
770
```
771
"""
772
function lowercasefirst(s::AbstractString)
    isempty(s) && return ""
    head = s[1]
    lowered = lowercase(head)
    # Nothing to change: hand back the input converted to `String`.
    if head == lowered
        return convert(String, s)
    end
    return string(lowered, SubString(s, nextind(s, 1)))
end
779

780
# TODO: improve performance characteristics, room for a ~5x improvement.
781
function lowercasefirst(s::AnnotatedString)
    # The boolean state is `true` only until the first character has been handled.
    return annotated_chartransform(s, true) do c, pending
        pending ? (lowercase(c), false) : (c, pending)
    end
end
790

791
############################################################################
792
# iterators for grapheme segmentation
793

794
# Stateless grapheme-break test between two adjacent characters.
# A malformed character on either side always counts as a break.
function isgraphemebreak(c1::AbstractChar, c2::AbstractChar)
    (ismalformed(c1) || ismalformed(c2)) && return true
    return ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
end
797

798
# Stateful grapheme break required by Unicode-9 rules: the string
799
# must be processed in sequence, with state initialized to Ref{Int32}(0).
800
# Requires utf8proc v2.0 or later.
801
@inline function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
    if !(ismalformed(c1) || ismalformed(c2))
        return ccall(:utf8proc_grapheme_break_stateful, Bool,
                     (UInt32, UInt32, Ref{Int32}), c1, c2, state)
    end
    # A malformed character forces a break and resets the state to 0.
    state[] = 0
    return true
end
809

810
# Iterator wrapper around a string, parameterized on the string type.
struct GraphemeIterator{S<:AbstractString}
    s::S # original string (for generation of SubStrings)
end

# Documented in Unicode module
function graphemes(s::AbstractString)
    return GraphemeIterator{typeof(s)}(s)
end

# Elements are `SubString`s of the wrapped string type. When the wrapped string
# is itself a `SubString{S}`, the element type is `SubString{S}` as well.
function eltype(::Type{GraphemeIterator{S}}) where {S}
    return SubString{S}
end

function eltype(::Type{GraphemeIterator{SubString{S}}}) where {S}
    return SubString{S}
end
2✔
819

820
# Number of graphemes: count the breaks found between consecutive characters,
# starting from a zero character of the string's element type.
function length(g::GraphemeIterator{S}) where {S}
    state = Ref{Int32}(0)
    prev = eltype(S)(0x00000000)
    total = 0
    for c in g.s
        total += isgraphemebreak!(state, prev, c)
        prev = c
    end
    return total
end
830

831
# Iteration state is `(break_state, index)`: the utf8proc break state carried
# between graphemes and the index at which the next grapheme starts.
function iterate(g::GraphemeIterator, i_=(Int32(0),firstindex(g.s)))
    s = g.s
    statei, start = i_
    state = Ref{Int32}(statei)
    head = iterate(s, start)
    head === nothing && return nothing
    prev, next = head
    stop = start
    while next <= ncodeunits(s) # loop until next grapheme is s[start:stop]
        c, after = iterate(s, next)::NTuple{2,Any}
        isgraphemebreak!(state, prev, c) && break
        stop = next
        next = after
        prev = c
    end
    return (SubString(s, start, stop), (state[], next))
end
848

849
# Equality, hashing and ordering all delegate to the wrapped string.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end

function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end

function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end
70✔
852

853
# One-line display: grapheme count, iterator type, and the wrapped string in quotes.
function show(io::IO, g::GraphemeIterator{S}) where {S}
    text = "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\""
    return print(io, text)
end
1✔
854

855
############################################################################
856

857
end # module
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc