• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

OSGeo / gdal / 14945877415

10 May 2025 01:37PM UTC coverage: 70.838% (+0.004%) from 70.834%
14945877415

Pull #12331

github

web-flow
Merge e74fde6a5 into a1ee70739
Pull Request #12331: GDALG: do not Open() in update mode, and 100% code coverage

12 of 12 new or added lines in 2 files covered. (100.0%)

73 existing lines in 35 files now uncovered.

565410 of 798178 relevant lines covered (70.84%)

234821.49 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

84.0
/port/cpl_recode.cpp
1
/**********************************************************************
2
 *
3
 * Name:     cpl_recode.cpp
4
 * Project:  CPL - Common Portability Library
5
 * Purpose:  Character set recoding and char/wchar_t conversions.
6
 * Author:   Andrey Kiselev, dron@ak4719.spb.edu
7
 *
8
 **********************************************************************
9
 * Copyright (c) 2011, Andrey Kiselev <dron@ak4719.spb.edu>
10
 * Copyright (c) 2008, Frank Warmerdam
11
 * Copyright (c) 2011-2014, Even Rouault <even dot rouault at spatialys.com>
12
 *
13
 * Permission to use, copy, modify, and distribute this software for any
14
 * purpose with or without fee is hereby granted, provided that the above
15
 * copyright notice and this permission notice appear in all copies.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
18
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
19
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
20
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
22
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
23
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
 **********************************************************************/
25

26
#include "cpl_port.h"
27
#include "cpl_string.h"
28

29
#include <cstring>
30

31
#include "cpl_conv.h"
32
#include "cpl_character_sets.h"
33

34
#include "utf8.h"
35

36
#ifdef CPL_RECODE_ICONV
37
extern void CPLClearRecodeIconvWarningFlags();
38
extern char *CPLRecodeIconv(const char *, const char *,
39
                            const char *) CPL_RETURNS_NONNULL;
40
extern char *CPLRecodeFromWCharIconv(const wchar_t *, const char *,
41
                                     const char *);
42
extern wchar_t *CPLRecodeToWCharIconv(const char *, const char *, const char *);
43
#endif  // CPL_RECODE_ICONV
44

45
extern void CPLClearRecodeStubWarningFlags();
46
extern char *CPLRecodeStub(const char *, const char *,
47
                           const char *) CPL_RETURNS_NONNULL;
48
extern char *CPLRecodeFromWCharStub(const wchar_t *, const char *,
49
                                    const char *);
50
extern wchar_t *CPLRecodeToWCharStub(const char *, const char *, const char *);
51
extern int CPLIsUTF8Stub(const char *, int);
52

53
/************************************************************************/
54
/*                             CPLRecode()                              */
55
/************************************************************************/
56

57
/**
58
 * Convert a string from a source encoding to a destination encoding.
59
 *
60
 * The only guaranteed supported encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
61
 * and CPL_ENC_ISO8859_1. Currently, the following conversions are supported :
62
 * <ul>
63
 *  <li>CPL_ENC_ASCII -> CPL_ENC_UTF8 or CPL_ENC_ISO8859_1 (no conversion in
64
 *  fact)</li>
65
 *  <li>CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8</li>
66
 *  <li>CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1</li>
67
 * </ul>
68
 *
69
 * If an error occurs an error may, or may not be posted with CPLError().
70
 *
71
 * @param pszSource a NULL terminated string.
72
 * @param pszSrcEncoding the source encoding.
73
 * @param pszDstEncoding the destination encoding.
74
 *
75
 * @return a NULL terminated string which should be freed with CPLFree().
76
 *
77
 * @since GDAL 1.6.0
78
 */
79

80
char CPL_DLL *CPLRecode(const char *pszSource, const char *pszSrcEncoding,
784,186✔
81
                        const char *pszDstEncoding)
82

83
{
84
    /* -------------------------------------------------------------------- */
85
    /*      Handle a few common short cuts.                                 */
86
    /* -------------------------------------------------------------------- */
87
    if (EQUAL(pszSrcEncoding, pszDstEncoding))
784,186✔
88
        return CPLStrdup(pszSource);
171✔
89

90
    if (EQUAL(pszSrcEncoding, CPL_ENC_ASCII) &&
784,015✔
91
        (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
×
92
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
×
UNCOV
93
        return CPLStrdup(pszSource);
×
94

95
    // A few hard coded CPxxx/ISO-8859-x to UTF-8 tables
96
    if (EQUAL(pszDstEncoding, CPL_ENC_UTF8) &&
1,518,070✔
97
        CPLGetConversionTableToUTF8(pszSrcEncoding))
734,057✔
98
    {
99
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
21,286✔
100
    }
101

102
#ifdef CPL_RECODE_ICONV
103
    /* -------------------------------------------------------------------- */
104
    /*      CPL_ENC_ISO8859_1 -> CPL_ENC_UTF8                               */
105
    /*      and CPL_ENC_UTF8 -> CPL_ENC_ISO8859_1 conversions are handled   */
106
    /*      very well by the stub implementation which is faster than the   */
107
    /*      iconv() route. Use a stub for these two ones and iconv()        */
108
    /*      everything else.                                                */
109
    /* -------------------------------------------------------------------- */
110
    if ((EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1) &&
762,729✔
111
         EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
712,634✔
112
        (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
50,095✔
113
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
49,958✔
114
    {
115
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
762,446✔
116
    }
117
#ifdef _WIN32
118
    else if (((EQUAL(pszSrcEncoding, "CP_ACP") ||
119
               EQUAL(pszSrcEncoding, "CP_OEMCP")) &&
120
              EQUAL(pszDstEncoding, CPL_ENC_UTF8)) ||
121
             (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) &&
122
              (EQUAL(pszDstEncoding, "CP_ACP") ||
123
               EQUAL(pszDstEncoding, "CP_OEMCP"))))
124
    {
125
        return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
126
    }
127
#endif
128
    else
129
    {
130
        return CPLRecodeIconv(pszSource, pszSrcEncoding, pszDstEncoding);
283✔
131
    }
132
#else   // CPL_RECODE_STUB
133
    return CPLRecodeStub(pszSource, pszSrcEncoding, pszDstEncoding);
134
#endif  // CPL_RECODE_ICONV
135
}
136

137
/************************************************************************/
138
/*                         CPLRecodeFromWChar()                         */
139
/************************************************************************/
140

141
/**
142
 * Convert wchar_t string to UTF-8.
143
 *
144
 * Convert a wchar_t string into a multibyte utf-8 string.  The only
145
 * guaranteed supported source encoding is CPL_ENC_UCS2, and the only
146
 * guaranteed supported destination encodings are CPL_ENC_UTF8, CPL_ENC_ASCII
147
 * and CPL_ENC_ISO8859_1.  In some cases (i.e. using iconv()) other encodings
148
 * may also be supported.
149
 *
150
 * Note that the wchar_t type varies in size on different systems. On
151
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
152
 *
153
 * If an error occurs an error may, or may not be posted with CPLError().
154
 *
155
 * @param pwszSource the source wchar_t string, terminated with a 0 wchar_t.
156
 * @param pszSrcEncoding the source encoding, typically CPL_ENC_UCS2.
157
 * @param pszDstEncoding the destination encoding, typically CPL_ENC_UTF8.
158
 *
159
 * @return a zero terminated multi-byte string which should be freed with
160
 * CPLFree(), or NULL if an error occurs.
161
 *
162
 * @since GDAL 1.6.0
163
 */
164

165
char CPL_DLL *CPLRecodeFromWChar(const wchar_t *pwszSource,
129,984✔
166
                                 const char *pszSrcEncoding,
167
                                 const char *pszDstEncoding)
168

169
{
170
#ifdef CPL_RECODE_ICONV
171
    /* -------------------------------------------------------------------- */
172
    /*      Conversions from CPL_ENC_UCS2                                   */
173
    /*      to CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well   */
174
    /*      handled by the stub implementation.                             */
175
    /* -------------------------------------------------------------------- */
176
    if ((EQUAL(pszSrcEncoding, CPL_ENC_UCS2) ||
129,984✔
177
         EQUAL(pszSrcEncoding, "WCHAR_T")) &&
1,360✔
178
        (EQUAL(pszDstEncoding, CPL_ENC_UTF8) ||
129,983✔
179
         EQUAL(pszDstEncoding, CPL_ENC_ASCII) ||
×
180
         EQUAL(pszDstEncoding, CPL_ENC_ISO8859_1)))
×
181
    {
182
        return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding,
129,983✔
183
                                      pszDstEncoding);
129,983✔
184
    }
185

186
    return CPLRecodeFromWCharIconv(pwszSource, pszSrcEncoding, pszDstEncoding);
1✔
187

188
#else   // CPL_RECODE_STUB
189
    return CPLRecodeFromWCharStub(pwszSource, pszSrcEncoding, pszDstEncoding);
190
#endif  // CPL_RECODE_ICONV
191
}
192

193
/************************************************************************/
194
/*                          CPLRecodeToWChar()                          */
195
/************************************************************************/
196

197
/**
198
 * Convert UTF-8 string to a wchar_t string.
199
 *
200
 * Convert a 8bit, multi-byte per character input string into a wide
201
 * character (wchar_t) string.  The only guaranteed supported source encodings
202
 * are CPL_ENC_UTF8, CPL_ENC_ASCII and CPL_ENC_ISO8869_1 (LATIN1).  The only
203
 * guaranteed supported destination encoding is CPL_ENC_UCS2.  Other source
204
 * and destination encodings may be supported depending on the underlying
205
 * implementation.
206
 *
207
 * Note that the wchar_t type varies in size on different systems. On
208
 * win32 it is normally 2 bytes, and on UNIX 4 bytes.
209
 *
210
 * If an error occurs an error may, or may not be posted with CPLError().
211
 *
212
 * @param pszSource input multi-byte character string.
213
 * @param pszSrcEncoding source encoding, typically CPL_ENC_UTF8.
214
 * @param pszDstEncoding destination encoding, typically CPL_ENC_UCS2.
215
 *
216
 * @return the zero terminated wchar_t string (to be freed with CPLFree()) or
217
 * NULL on error.
218
 *
219
 * @since GDAL 1.6.0
220
 */
221

222
wchar_t CPL_DLL *CPLRecodeToWChar(const char *pszSource,
40,833✔
223
                                  const char *pszSrcEncoding,
224
                                  const char *pszDstEncoding)
225

226
{
227
#ifdef CPL_RECODE_ICONV
228
    /* -------------------------------------------------------------------- */
229
    /*      Conversions to CPL_ENC_UCS2                                     */
230
    /*      from CPL_ENC_UTF8, CPL_ENC_ISO8859_1 and CPL_ENC_ASCII are well */
231
    /*      handled by the stub implementation.                             */
232
    /* -------------------------------------------------------------------- */
233
    if ((EQUAL(pszDstEncoding, CPL_ENC_UCS2) ||
40,833✔
234
         EQUAL(pszDstEncoding, "WCHAR_T")) &&
×
235
        (EQUAL(pszSrcEncoding, CPL_ENC_UTF8) ||
40,833✔
236
         EQUAL(pszSrcEncoding, CPL_ENC_ASCII) ||
×
237
         EQUAL(pszSrcEncoding, CPL_ENC_ISO8859_1)))
×
238
    {
239
        return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
40,833✔
240
    }
241

242
    return CPLRecodeToWCharIconv(pszSource, pszSrcEncoding, pszDstEncoding);
×
243

244
#else   // CPL_RECODE_STUB
245
    return CPLRecodeToWCharStub(pszSource, pszSrcEncoding, pszDstEncoding);
246
#endif  // CPL_RECODE_ICONV
247
}
248

249
/************************************************************************/
250
/*                               CPLIsASCII()                           */
251
/************************************************************************/
252

253
/**
 * Test if a string is encoded as ASCII.
 *
 * Every byte in the inspected range must have a value of 127 or less. An
 * empty range is considered ASCII.
 *
 * @param pabyData input string to test
 * @param nLen length of the input string, or -1 (that is
 *             static_cast<size_t>(-1)) if the function must compute
 *             the string length. In which case it must be null terminated.
 * @return true if the string is encoded as ASCII. false otherwise
 *
 * @since GDAL 3.6.0
 */
bool CPLIsASCII(const char *pabyData, size_t nLen)
{
    if (nLen == static_cast<size_t>(-1))
        nLen = strlen(pabyData);
    for (size_t i = 0; i < nLen; ++i)
    {
        // Go through unsigned char: plain char may be signed, in which case
        // bytes >= 0x80 would compare as negative values.
        if (static_cast<unsigned char>(pabyData[i]) > 127)
            return false;
    }
    return true;
}
274

275
/************************************************************************/
276
/*                          CPLForceToASCII()                           */
277
/************************************************************************/
278

279
/**
280
 * Return a new string that is made only of ASCII characters. If non-ASCII
281
 * characters are found in the input string, they will be replaced by the
282
 * provided replacement character.
283
 *
284
 * This function does not make any assumption on the encoding of the input
285
 * string (except it must be nul-terminated if nLen equals -1, or have at
286
 * least nLen bytes otherwise). CPLUTF8ForceToASCII() can be used instead when
287
 * the input string is known to be UTF-8 encoded.
288
 *
289
 * @param pabyData input string to test
290
 * @param nLen length of the input string, or -1 if the function must compute
291
 *             the string length. In which case it must be null terminated.
292

293
 * @param chReplacementChar character which will be used when the input stream
294
 *                          contains a non ASCII character. Must be valid ASCII!
295
 *
296
 * @return a new string that must be freed with CPLFree().
297
 *
298
 * @since GDAL 1.7.0
299
 */
300
char *CPLForceToASCII(const char *pabyData, int nLen, char chReplacementChar)
5✔
301
{
302
    const size_t nRealLen =
5✔
303
        (nLen >= 0) ? static_cast<size_t>(nLen) : strlen(pabyData);
5✔
304
    char *pszOutputString = static_cast<char *>(CPLMalloc(nRealLen + 1));
5✔
305
    const char *pszPtr = pabyData;
5✔
306
    const char *pszEnd = pabyData + nRealLen;
5✔
307
    size_t i = 0;
5✔
308
    while (pszPtr != pszEnd)
19✔
309
    {
310
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
14✔
311
        {
312
            pszOutputString[i] = chReplacementChar;
3✔
313
            ++pszPtr;
3✔
314
            ++i;
3✔
315
        }
316
        else
317
        {
318
            pszOutputString[i] = *pszPtr;
11✔
319
            ++pszPtr;
11✔
320
            ++i;
11✔
321
        }
322
    }
323
    pszOutputString[i] = '\0';
5✔
324
    return pszOutputString;
5✔
325
}
326

327
/************************************************************************/
328
/*                       CPLUTF8ForceToASCII()                          */
329
/************************************************************************/
330

331
/**
332
 * Return a new string that is made only of ASCII characters. If non-ASCII
333
 * characters are found in the input string, for which an "equivalent" ASCII
334
 * character is not found, they will be replaced by the provided replacement
335
 * character.
336
 *
337
 * This function is aware of https://en.wikipedia.org/wiki/Latin-1_Supplement
338
 * and https://en.wikipedia.org/wiki/Latin_Extended-A to provide sensible
339
 * replacements for accented characters.
340

341
 * @param pszStr NUL-terminated UTF-8 string.
342
 * @param chReplacementChar character which will be used when the input stream
343
 *                          contains a non ASCII character that cannot be
344
 *                          substituted with an equivalent ASCII character.
345
 *                          Must be valid ASCII!
346
 *
347
 * @return a new string that must be freed with CPLFree().
348
 *
349
 * @since GDAL 3.9
350
 */
351
char *CPLUTF8ForceToASCII(const char *pszStr, char chReplacementChar)
17✔
352
{
353
    static const struct
354
    {
355
        short nCodePoint;
356
        char chFirst;
357
        char chSecond;
358
    } aLatinCharacters[] = {
359
        // https://en.wikipedia.org/wiki/Latin-1_Supplement
360
        {0xC0, 'A', 0},    // Latin Capital Letter A with grave
361
        {0xC1, 'A', 0},    // Latin Capital letter A with acute
362
        {0xC2, 'A', 0},    // Latin Capital letter A with circumflex
363
        {0xC3, 'A', 0},    // Latin Capital letter A with tilde
364
        {0xC4, 'A', 0},    // Latin Capital letter A with diaeresis
365
        {0xC5, 'A', 0},    // Latin Capital letter A with ring above
366
        {0xC6, 'A', 'E'},  // Latin Capital letter AE
367
        {0xC7, 'C', 0},    // Latin Capital letter C with cedilla
368
        {0xC8, 'E', 0},    // Latin Capital letter E with grave
369
        {0xC9, 'E', 0},    // Latin Capital letter E with acute
370
        {0xCA, 'E', 0},    // Latin Capital letter E with circumflex
371
        {0xCB, 'E', 0},    // Latin Capital letter E with diaeresis
372
        {0xCC, 'I', 0},    // Latin Capital letter I with grave
373
        {0xCD, 'I', 0},    // Latin Capital letter I with acute
374
        {0xCE, 'I', 0},    // Latin Capital letter I with circumflex
375
        {0xCF, 'I', 0},    // Latin Capital letter I with diaeresis
376
        // { 0xD0, '?', 0 }, // Latin Capital letter Eth
377
        {0xD1, 'N', 0},  // Latin Capital letter N with tilde
378
        {0xD2, 'O', 0},  // Latin Capital letter O with grave
379
        {0xD3, 'O', 0},  // Latin Capital letter O with acute
380
        {0xD4, 'O', 0},  // Latin Capital letter O with circumflex
381
        {0xD5, 'O', 0},  // Latin Capital letter O with tilde
382
        {0xD6, 'O', 0},  // Latin Capital letter O with diaeresis
383
        {0xD8, 'O', 0},  // Latin Capital letter O with stroke
384
        {0xD9, 'U', 0},  // Latin Capital letter U with grave
385
        {0xDA, 'U', 0},  // Latin Capital letter U with acute
386
        {0xDB, 'U', 0},  // Latin Capital Letter U with circumflex
387
        {0xDC, 'U', 0},  // Latin Capital Letter U with diaeresis
388
        {0xDD, 'Y', 0},  // Latin Capital Letter Y with acute
389
        // { 0xDE, '?', 0 }, // Latin Capital Letter Thorn
390
        {0xDF, 'S', 'S'},  // Latin Small Letter sharp S
391
        {0xE0, 'a', 0},    // Latin Small Letter A with grave
392
        {0xE1, 'a', 0},    // Latin Small Letter A with acute
393
        {0xE2, 'a', 0},    // Latin Small Letter A with circumflex
394
        {0xE3, 'a', 0},    // Latin Small Letter A with tilde
395
        {0xE4, 'a', 0},    // Latin Small Letter A with diaeresis
396
        {0xE5, 'a', 0},    // Latin Small Letter A with ring above
397
        {0xE6, 'a', 'e'},  // Latin Small Letter AE
398
        {0xE7, 'c', 0},    // Latin Small Letter C with cedilla
399
        {0xE8, 'e', 0},    // Latin Small Letter E with grave
400
        {0xE9, 'e', 0},    // Latin Small Letter E with acute
401
        {0xEA, 'e', 0},    // Latin Small Letter E with circumflex
402
        {0xEB, 'e', 0},    // Latin Small Letter E with diaeresis
403
        {0xEC, 'i', 0},    // Latin Small Letter I with grave
404
        {0xED, 'i', 0},    // Latin Small Letter I with acute
405
        {0xEE, 'i', 0},    // Latin Small Letter I with circumflex
406
        {0xEF, 'i', 0},    // Latin Small Letter I with diaeresis
407
        // { 0xF0, '?', 0 }, // Latin Small Letter Eth
408
        {0xF1, 'n', 0},  // Latin Small Letter N with tilde
409
        {0xF2, 'o', 0},  // Latin Small Letter O with grave
410
        {0xF3, 'o', 0},  // Latin Small Letter O with acute
411
        {0xF4, 'o', 0},  // Latin Small Letter O with circumflex
412
        {0xF5, 'o', 0},  // Latin Small Letter O with tilde
413
        {0xF6, 'o', 0},  // Latin Small Letter O with diaeresis
414
        {0xF8, 'o', 0},  // Latin Small Letter O with stroke
415
        {0xF9, 'u', 0},  // Latin Small Letter U with grave
416
        {0xFA, 'u', 0},  // Latin Small Letter U with acute
417
        {0xFB, 'u', 0},  // Latin Small Letter U with circumflex
418
        {0xFC, 'u', 0},  // Latin Small Letter U with diaeresis
419
        {0xFD, 'y', 0},  // Latin Small Letter Y with acute
420
        // { 0xFE, '?', 0 }, // Latin Small Letter Thorn
421
        {0xFF, 'u', 0},  // Latin Small Letter Y with diaeresis
422

423
        // https://en.wikipedia.org/wiki/Latin_Extended-A
424
        {
425
            0x0100,
426
            'A',
427
            0,
428
        },  // Latin Capital letter A with macron
429
        {
430
            0x0101,
431
            'a',
432
            0,
433
        },  // Latin Small letter A with macron
434
        {
435
            0x0102,
436
            'A',
437
            0,
438
        },  // Latin Capital letter A with breve
439
        {
440
            0x0103,
441
            'a',
442
            0,
443
        },  // Latin Small letter A with breve
444
        {
445
            0x0104,
446
            'A',
447
            0,
448
        },  // Latin Capital letter A with ogonek
449
        {
450
            0x0105,
451
            'a',
452
            0,
453
        },  // Latin Small letter A with ogonek
454
        {
455
            0x0106,
456
            'C',
457
            0,
458
        },  // Latin Capital letter C with acute
459
        {
460
            0x0107,
461
            'c',
462
            0,
463
        },  // Latin Small letter C with acute
464
        {
465
            0x0108,
466
            'C',
467
            0,
468
        },  // Latin Capital letter C with circumflex
469
        {
470
            0x0109,
471
            'c',
472
            0,
473
        },  // Latin Small letter C with circumflex
474
        {
475
            0x010A,
476
            'C',
477
            0,
478
        },  // Latin Capital letter C with dot above
479
        {
480
            0x010B,
481
            'c',
482
            0,
483
        },  // Latin Small letter C with dot above
484
        {
485
            0x010C,
486
            'C',
487
            0,
488
        },  // Latin Capital letter C with caron
489
        {
490
            0x010D,
491
            'c',
492
            0,
493
        },  // Latin Small letter C with caron
494
        {
495
            0x010E,
496
            'D',
497
            0,
498
        },  // Latin Capital letter D with caron
499
        {
500
            0x010F,
501
            'd',
502
            0,
503
        },  // Latin Small letter D with caron
504
        {
505
            0x0110,
506
            'D',
507
            0,
508
        },  // Latin Capital letter D with stroke
509
        {
510
            0x0111,
511
            'd',
512
            0,
513
        },  // Latin Small letter D with stroke
514
        {
515
            0x0112,
516
            'E',
517
            0,
518
        },  // Latin Capital letter E with macron
519
        {
520
            0x0113,
521
            'e',
522
            0,
523
        },  // Latin Small letter E with macron
524
        {
525
            0x0114,
526
            'E',
527
            0,
528
        },  // Latin Capital letter E with breve
529
        {
530
            0x0115,
531
            'e',
532
            0,
533
        },  // Latin Small letter E with breve
534
        {
535
            0x0116,
536
            'E',
537
            0,
538
        },  // Latin Capital letter E with dot above
539
        {
540
            0x0117,
541
            'e',
542
            0,
543
        },  // Latin Small letter E with dot above
544
        {
545
            0x0118,
546
            'E',
547
            0,
548
        },  // Latin Capital letter E with ogonek
549
        {
550
            0x0119,
551
            'e',
552
            0,
553
        },  // Latin Small letter E with ogonek
554
        {
555
            0x011A,
556
            'E',
557
            0,
558
        },  // Latin Capital letter E with caron
559
        {
560
            0x011B,
561
            'e',
562
            0,
563
        },  // Latin Small letter E with caron
564
        {
565
            0x011C,
566
            'G',
567
            0,
568
        },  // Latin Capital letter G with circumflex
569
        {
570
            0x011D,
571
            'g',
572
            0,
573
        },  // Latin Small letter G with circumflex
574
        {
575
            0x011E,
576
            'G',
577
            0,
578
        },  // Latin Capital letter G with breve
579
        {
580
            0x011F,
581
            'g',
582
            0,
583
        },  // Latin Small letter G with breve
584
        {
585
            0x0120,
586
            'G',
587
            0,
588
        },  // Latin Capital letter G with dot above
589
        {
590
            0x0121,
591
            'g',
592
            0,
593
        },  // Latin Small letter G with dot above
594
        {
595
            0x0122,
596
            'G',
597
            0,
598
        },  // Latin Capital letter G with cedilla
599
        {
600
            0x0123,
601
            'g',
602
            0,
603
        },  // Latin Small letter G with cedilla
604
        {
605
            0x0124,
606
            'H',
607
            0,
608
        },  // Latin Capital letter H with circumflex
609
        {
610
            0x0125,
611
            'h',
612
            0,
613
        },  // Latin Small letter H with circumflex
614
        {
615
            0x0126,
616
            'H',
617
            0,
618
        },  // Latin Capital letter H with stroke
619
        {
620
            0x0127,
621
            'h',
622
            0,
623
        },  // Latin Small letter H with stroke
624
        {
625
            0x0128,
626
            'I',
627
            0,
628
        },  // Latin Capital letter I with tilde
629
        {
630
            0x0129,
631
            'i',
632
            0,
633
        },  // Latin Small letter I with tilde
634
        {
635
            0x012A,
636
            'I',
637
            0,
638
        },  // Latin Capital letter I with macron
639
        {
640
            0x012B,
641
            'i',
642
            0,
643
        },  // Latin Small letter I with macron
644
        {
645
            0x012C,
646
            'I',
647
            0,
648
        },  // Latin Capital letter I with breve
649
        {
650
            0x012D,
651
            'i',
652
            0,
653
        },  // Latin Small letter I with breve
654
        {
655
            0x012E,
656
            'I',
657
            0,
658
        },  // Latin Capital letter I with ogonek
659
        {
660
            0x012F,
661
            'i',
662
            0,
663
        },  // Latin Small letter I with ogonek
664
        {
665
            0x0130,
666
            'I',
667
            0,
668
        },  // Latin Capital letter I with dot above
669
        {
670
            0x0131,
671
            'i',
672
            0,
673
        },  // Latin Small letter dotless I
674
        {
675
            0x0132,
676
            'I',
677
            'J',
678
        },  // Latin Capital Ligature IJ
679
        {
680
            0x0133,
681
            'i',
682
            'j',
683
        },  // Latin Small Ligature IJ
684
        {
685
            0x0134,
686
            'J',
687
            0,
688
        },  // Latin Capital letter J with circumflex
689
        {
690
            0x0135,
691
            'j',
692
            0,
693
        },  // Latin Small letter J with circumflex
694
        {
695
            0x0136,
696
            'K',
697
            0,
698
        },  // Latin Capital letter K with cedilla
699
        {
700
            0x0137,
701
            'k',
702
            0,
703
        },  // Latin Small letter K with cedilla
704
        {
705
            0x0138,
706
            'k',
707
            0,
708
        },  // Latin Small letter Kra
709
        {
710
            0x0139,
711
            'L',
712
            0,
713
        },  // Latin Capital letter L with acute
714
        {
715
            0x013A,
716
            'l',
717
            0,
718
        },  // Latin Small letter L with acute
719
        {
720
            0x013B,
721
            'L',
722
            0,
723
        },  // Latin Capital letter L with cedilla
724
        {
725
            0x013C,
726
            'l',
727
            0,
728
        },  // Latin Small letter L with cedilla
729
        {
730
            0x013D,
731
            'L',
732
            0,
733
        },  // Latin Capital letter L with caron
734
        {
735
            0x013E,
736
            'l',
737
            0,
738
        },  // Latin Small letter L with caron
739
        {
740
            0x013F,
741
            'L',
742
            0,
743
        },  // Latin Capital letter L with middle dot
744
        {
745
            0x0140,
746
            'l',
747
            0,
748
        },  // Latin Small letter L with middle dot
749
        {
750
            0x0141,
751
            'L',
752
            0,
753
        },  // Latin Capital letter L with stroke
754
        {
755
            0x0142,
756
            'l',
757
            0,
758
        },  // Latin Small letter L with stroke
759
        {
760
            0x0143,
761
            'N',
762
            0,
763
        },  // Latin Capital letter N with acute
764
        {
765
            0x0144,
766
            'n',
767
            0,
768
        },  // Latin Small letter N with acute
769
        {
770
            0x0145,
771
            'N',
772
            0,
773
        },  // Latin Capital letter N with cedilla
774
        {
775
            0x0146,
776
            'n',
777
            0,
778
        },  // Latin Small letter N with cedilla
779
        {
780
            0x0147,
781
            'N',
782
            0,
783
        },  // Latin Capital letter N with caron
784
        {
785
            0x0148,
786
            'n',
787
            0,
788
        },  // Latin Small letter N with caron
789
        // { 0x014A , '?' , 0, }, // Latin Capital letter Eng
790
        // { 0x014B , '?' , 0, }, // Latin Small letter Eng
791
        {
792
            0x014C,
793
            'O',
794
            0,
795
        },  // Latin Capital letter O with macron
796
        {
797
            0x014D,
798
            'o',
799
            0,
800
        },  // Latin Small letter O with macron
801
        {
802
            0x014E,
803
            'O',
804
            0,
805
        },  // Latin Capital letter O with breve
806
        {
807
            0x014F,
808
            'o',
809
            0,
810
        },  // Latin Small letter O with breve
811
        {
812
            0x0150,
813
            'O',
814
            0,
815
        },  // Latin Capital Letter O with double acute
816
        {
817
            0x0151,
818
            'o',
819
            0,
820
        },  // Latin Small Letter O with double acute
821
        {
822
            0x0152,
823
            'O',
824
            'E',
825
        },  // Latin Capital Ligature OE
826
        {
827
            0x0153,
828
            'o',
829
            'e',
830
        },  // Latin Small Ligature OE
831
        {
832
            0x0154,
833
            'R',
834
            0,
835
        },  // Latin Capital letter R with acute
836
        {
837
            0x0155,
838
            'r',
839
            0,
840
        },  // Latin Small letter R with acute
841
        {
842
            0x0156,
843
            'R',
844
            0,
845
        },  // Latin Capital letter R with cedilla
846
        {
847
            0x0157,
848
            'r',
849
            0,
850
        },  // Latin Small letter R with cedilla
851
        {
852
            0x0158,
853
            'R',
854
            0,
855
        },  // Latin Capital letter R with caron
856
        {
857
            0x0159,
858
            'r',
859
            0,
860
        },  // Latin Small letter R with caron
861
        {
862
            0x015A,
863
            'S',
864
            0,
865
        },  // Latin Capital letter S with acute
866
        {
867
            0x015B,
868
            's',
869
            0,
870
        },  // Latin Small letter S with acute
871
        {
872
            0x015C,
873
            'S',
874
            0,
875
        },  // Latin Capital letter S with circumflex
876
        {
877
            0x015D,
878
            's',
879
            0,
880
        },  // Latin Small letter S with circumflex
881
        {
882
            0x015E,
883
            'S',
884
            0,
885
        },  // Latin Capital letter S with cedilla
886
        {
887
            0x015F,
888
            's',
889
            0,
890
        },  // Latin Small letter S with cedilla
891
        {
892
            0x0160,
893
            'S',
894
            0,
895
        },  // Latin Capital letter S with caron
896
        {
897
            0x0161,
898
            's',
899
            0,
900
        },  // Latin Small letter S with caron
901
        {
902
            0x0162,
903
            'T',
904
            0,
905
        },  // Latin Capital letter T with cedilla
906
        {
907
            0x0163,
908
            't',
909
            0,
910
        },  // Latin Small letter T with cedilla
911
        {
912
            0x0164,
913
            'T',
914
            0,
915
        },  // Latin Capital letter T with caron
916
        {
917
            0x0165,
918
            't',
919
            0,
920
        },  // Latin Small letter T with caron
921
        {
922
            0x0166,
923
            'T',
924
            0,
925
        },  // Latin Capital letter T with stroke
926
        {
927
            0x0167,
928
            't',
929
            0,
930
        },  // Latin Small letter T with stroke
931
        {
932
            0x0168,
933
            'U',
934
            0,
935
        },  // Latin Capital letter U with tilde
936
        {
937
            0x0169,
938
            'u',
939
            0,
940
        },  // Latin Small letter U with tilde
941
        {
942
            0x016A,
943
            'U',
944
            0,
945
        },  // Latin Capital letter U with macron
946
        {
947
            0x016B,
948
            'u',
949
            0,
950
        },  // Latin Small letter U with macron
951
        {
952
            0x016C,
953
            'U',
954
            0,
955
        },  // Latin Capital letter U with breve
956
        {
957
            0x016D,
958
            'u',
959
            0,
960
        },  // Latin Small letter U with breve
961
        {
962
            0x016E,
963
            'U',
964
            0,
965
        },  // Latin Capital letter U with ring above
966
        {
967
            0x016F,
968
            'u',
969
            0,
970
        },  // Latin Small letter U with ring above
971
        {
972
            0x0170,
973
            'U',
974
            0,
975
        },  // Latin Capital Letter U with double acute
976
        {
977
            0x0171,
978
            'u',
979
            0,
980
        },  // Latin Small Letter U with double acute
981
        {
982
            0x0172,
983
            'U',
984
            0,
985
        },  // Latin Capital letter U with ogonek
986
        {
987
            0x0173,
988
            'u',
989
            0,
990
        },  // Latin Small letter U with ogonek
991
        {
992
            0x0174,
993
            'W',
994
            0,
995
        },  // Latin Capital letter W with circumflex
996
        {
997
            0x0175,
998
            'w',
999
            0,
1000
        },  // Latin Small letter W with circumflex
1001
        {
1002
            0x0176,
1003
            'Y',
1004
            0,
1005
        },  // Latin Capital letter Y with circumflex
1006
        {
1007
            0x0177,
1008
            'y',
1009
            0,
1010
        },  // Latin Small letter Y with circumflex
1011
        {
1012
            0x0178,
1013
            'Y',
1014
            0,
1015
        },  // Latin Capital letter Y with diaeresis
1016
        {
1017
            0x0179,
1018
            'Z',
1019
            0,
1020
        },  // Latin Capital letter Z with acute
1021
        {
1022
            0x017A,
1023
            'z',
1024
            0,
1025
        },  // Latin Small letter Z with acute
1026
        {
1027
            0x017B,
1028
            'Z',
1029
            0,
1030
        },  // Latin Capital letter Z with dot above
1031
        {
1032
            0x017C,
1033
            'z',
1034
            0,
1035
        },  // Latin Small letter Z with dot above
1036
        {
1037
            0x017D,
1038
            'Z',
1039
            0,
1040
        },  // Latin Capital letter Z with caron
1041
        {
1042
            0x017E,
1043
            'z',
1044
            0,
1045
        },  // Latin Small letter Z with caron
1046
    };
1047

1048
    const size_t nLen = strlen(pszStr);
17✔
1049
    char *pszOutputString = static_cast<char *>(CPLMalloc(nLen + 1));
17✔
1050
    const char *pszPtr = pszStr;
17✔
1051
    const char *pszEnd = pszStr + nLen;
17✔
1052
    size_t i = 0;
17✔
1053
    while (pszPtr != pszEnd)
255✔
1054
    {
1055
        if (*reinterpret_cast<const unsigned char *>(pszPtr) > 127)
240✔
1056
        {
1057
            utf8_int32_t codepoint;
1058
            if (pszPtr + utf8codepointcalcsize(
190✔
1059
                             reinterpret_cast<const utf8_int8_t *>(pszPtr)) >
190✔
1060
                pszEnd)
1061
                break;
2✔
1062
            auto pszNext = reinterpret_cast<const char *>(utf8codepoint(
188✔
1063
                reinterpret_cast<const utf8_int8_t *>(pszPtr), &codepoint));
1064
            char ch = chReplacementChar;
188✔
1065
            for (const auto &latin1char : aLatinCharacters)
17,075✔
1066
            {
1067
                if (codepoint == latin1char.nCodePoint)
17,073✔
1068
                {
1069
                    pszOutputString[i] = latin1char.chFirst;
186✔
1070
                    ++i;
186✔
1071
                    if (latin1char.chSecond)
186✔
1072
                    {
1073
                        pszOutputString[i] = latin1char.chSecond;
7✔
1074
                        ++i;
7✔
1075
                    }
1076
                    ch = 0;
186✔
1077
                    break;
186✔
1078
                }
1079
            }
1080
            if (ch)
188✔
1081
            {
1082
                pszOutputString[i] = ch;
2✔
1083
                ++i;
2✔
1084
            }
1085
            pszPtr = pszNext;
188✔
1086
        }
1087
        else
1088
        {
1089
            pszOutputString[i] = *pszPtr;
50✔
1090
            ++pszPtr;
50✔
1091
            ++i;
50✔
1092
        }
1093
    }
1094
    pszOutputString[i] = '\0';
17✔
1095
    return pszOutputString;
17✔
1096
}
1097

1098
/************************************************************************/
1099
/*                        CPLEncodingCharSize()                         */
1100
/************************************************************************/
1101

1102
/**
1103
 * Return bytes per character for encoding.
1104
 *
1105
 * This function returns the size in bytes of the smallest character
1106
 * in this encoding.  For fixed width encodings (ASCII, UCS-2, UCS-4) this
1107
 * is straight forward.  For encodings like UTF8 and UTF16 which represent
1108
 * some characters as a sequence of atomic character sizes the function
1109
 * still returns the atomic character size (1 for UTF8, 2 for UTF16).
1110
 *
1111
 * This function will return the correct value for well known encodings
1112
 * with corresponding CPL_ENC_ values.  It may not return the correct value
1113
 * for other encodings even if they are supported by the underlying iconv
1114
 * or windows transliteration services.  Hopefully it will improve over time.
1115
 *
1116
 * @param pszEncoding the name of the encoding.
1117
 *
1118
 * @return the size of a minimal character in bytes or -1 if the size is
1119
 * unknown.
1120
 */
1121

1122
int CPLEncodingCharSize(const char *pszEncoding)
1✔
1123

1124
{
1125
    if (EQUAL(pszEncoding, CPL_ENC_UTF8))
1✔
1126
        return 1;
×
1127
    else if (EQUAL(pszEncoding, CPL_ENC_UTF16) ||
1✔
1128
             EQUAL(pszEncoding, "UTF-16LE"))
1✔
1129
        return 2;
1✔
1130
    else if (EQUAL(pszEncoding, CPL_ENC_UCS2) || EQUAL(pszEncoding, "UCS-2LE"))
×
1131
        return 2;
×
1132
    else if (EQUAL(pszEncoding, CPL_ENC_UCS4))
×
1133
        return 4;
×
1134
    else if (EQUAL(pszEncoding, CPL_ENC_ASCII))
×
1135
        return 1;
×
1136
    else if (STARTS_WITH_CI(pszEncoding, "ISO-8859-"))
×
1137
        return 1;
×
1138

1139
    return -1;
×
1140
}
1141

1142
/************************************************************************/
1143
/*                    CPLClearRecodeWarningFlags()                      */
1144
/************************************************************************/
1145

1146
void CPLClearRecodeWarningFlags()
10,786✔
1147
{
1148
#ifdef CPL_RECODE_ICONV
1149
    CPLClearRecodeIconvWarningFlags();
10,786✔
1150
#endif
1151
    CPLClearRecodeStubWarningFlags();
10,786✔
1152
}
10,786✔
1153

1154
/************************************************************************/
1155
/*                         CPLStrlenUTF8()                              */
1156
/************************************************************************/
1157

1158
/**
 * Return the number of UTF-8 characters of a nul-terminated string.
 *
 * This is different from strlen() which returns the number of bytes.
 *
 * Every byte that is not a UTF-8 continuation byte (bit pattern 10xxxxxx)
 * is counted as the start of a character.  The input is not validated: a
 * malformed sequence is counted byte by byte with the same rule.
 *
 * The counter has the type of the return value, so the result is only
 * meaningful for strings holding at most INT_MAX characters.
 *
 * @param pszUTF8Str a nul-terminated UTF-8 string
 *
 * @return the number of UTF-8 characters.
 */

int CPLStrlenUTF8(const char *pszUTF8Str)
{
    int nCharacterCount = 0;
    // Walk the string with a pointer rather than an int index, so that the
    // iteration itself cannot overflow on inputs longer than INT_MAX bytes.
    for (const char *pszIter = pszUTF8Str; *pszIter != '\0'; ++pszIter)
    {
        // Skip continuation bytes (10xxxxxx): they belong to the character
        // started by a previous lead byte.
        if ((*pszIter & 0xc0) != 0x80)
            ++nCharacterCount;
    }
    return nCharacterCount;
}
1178

1179
/************************************************************************/
1180
/*                           CPLCanRecode()                             */
1181
/************************************************************************/
1182

1183
/**
1184
 * Checks if it is possible to recode a string from one encoding to another.
1185
 *
1186
 * @param pszTestStr a NULL terminated string.
1187
 * @param pszSrcEncoding the source encoding.
1188
 * @param pszDstEncoding the destination encoding.
1189
 *
1190
 * @return a TRUE if recode is possible.
1191
 *
1192
 * @since GDAL 3.1.0
1193
 */
1194
int CPLCanRecode(const char *pszTestStr, const char *pszSrcEncoding,
5,790✔
1195
                 const char *pszDstEncoding)
1196
{
1197
    CPLClearRecodeWarningFlags();
5,790✔
1198
    CPLErrorReset();
5,790✔
1199

1200
    CPLPushErrorHandler(CPLQuietErrorHandler);
5,790✔
1201
    char *pszRec(CPLRecode(pszTestStr, pszSrcEncoding, pszDstEncoding));
5,790✔
1202
    CPLPopErrorHandler();
5,790✔
1203

1204
    if (pszRec == nullptr)
5,790✔
1205
    {
1206
        return FALSE;
×
1207
    }
1208

1209
    CPLFree(pszRec);
5,790✔
1210

1211
    if (CPLGetLastErrorType() != 0)
5,790✔
1212
    {
1213
        return FALSE;
1✔
1214
    }
1215

1216
    return TRUE;
5,789✔
1217
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc