• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

STEllAR-GROUP / hpx / #882

31 Aug 2023 07:44PM UTC coverage: 41.798% (-44.7%) from 86.546%
#882

push

19442 of 46514 relevant lines covered (41.8%)

126375.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

5.48
/libs/core/program_options/src/utf8_codecvt_facet.cpp
1
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2
// utf8_codecvt_facet.ipp
3

4
//  Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5
//  Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6
//
7
//  SPDX-License-Identifier: BSL-1.0
8
//  Use, modification and distribution is subject to the Boost Software
9
//  License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
10
//  http://www.boost.org/LICENSE_1_0.txt)
11

12
// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
13
// learn how this file should be used.
14

15
#include <hpx/program_options/config.hpp>
16
#include <hpx/program_options/detail/utf8_codecvt_facet.hpp>
17

18
#include <cstddef>
19
#include <cstdint>
20
#include <cstdlib>    // for multi-byte conversion routines
21
#include <limits>
22

23
// If we don't have wstring, then Unicode support
24
// is not available anyway, so we don't need to even
25
// compile this file.
26

27
/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
28
namespace hpx::program_options::detail {
29

30
    // implementation for wchar_t
31

64✔
32
    utf8_codecvt_facet::utf8_codecvt_facet(std::size_t no_locale_manage)
64✔
33
      : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
34
    {
64✔
35
    }
36

64✔
37
    // Destructor: defined out of line here and left to the
    // compiler-generated implementation.
    utf8_codecvt_facet::~utf8_codecvt_facet() = default;
38

39
    // Translate incoming UTF-8 into UCS-4
×
40
    std::codecvt_base::result utf8_codecvt_facet::do_in(
41
        std::mbstate_t& /*state*/, char const* from, char const* from_end,
42
        char const*& from_next, wchar_t* to, wchar_t* to_end,
43
        wchar_t*& to_next) const
44
    {
45
        // Basic algorithm:  The first octet determines how many
46
        // octets total make up the UCS-4 character.  The remaining
47
        // "continuing octets" all begin with "10". To convert, subtract the
48
        // amount that specifies the number of octets from the first
49
        // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
50
        // then mash the whole lot together.  Note that each continuing
51
        // octet only uses 6 bits as unique values, so only shift by multiples
52
        // of 6 to combine.
×
53
        while (from != from_end && to != to_end)
54
        {
55
            // Error checking   on the first octet
×
56
            if (invalid_leading_octet(*from))
57
            {
×
58
                from_next = from;
×
59
                to_next = to;
×
60
                return std::codecvt_base::error;
61
            }
62

63
            // The first octet is   adjusted by a value dependent upon the
64
            // number   of "continuing octets" encoding the character
65
            int const cont_octet_count =
×
66
                static_cast<int>(get_cont_octet_count(*from));
×
67
            wchar_t const octet1_modifier_table[] = {
68
                0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
69

70
            // The unsigned char conversion is necessary in case char is
71
            // signed   (I learned this the hard way)
×
72
            wchar_t ucs_result = static_cast<unsigned char>(*from++) -
×
73
                octet1_modifier_table[cont_octet_count];
74

75
            // Invariants   :
76
            //   1) At the start of the loop,   'i' continuing characters have been
77
            //    processed
78
            //   2) *from   points to the next continuing character to be processed.
79
            int i = 0;
×
80
            while (i != cont_octet_count && from != from_end)
81
            {
82
                // Error checking on continuing characters
×
83
                if (invalid_continuing_octet(*from))
84
                {
×
85
                    from_next = from;
×
86
                    to_next = to;
×
87
                    return std::codecvt_base::error;
88
                }
89

×
90
                ucs_result *= (1 << 6);
91

92
                // each continuing character has an extra (10xxxxxx)b attached to
93
                // it that must be removed.
×
94
                ucs_result += static_cast<unsigned char>(*from++) - 0x80;
×
95
                ++i;
96
            }
97

98
            // If   the buffer ends with an incomplete unicode character...
×
99
            if (from == from_end && i != cont_octet_count)
100
            {
101
                // rewind "from" to before the current character translation
×
102
                from_next = from - (static_cast<std::size_t>(i) + 1);
×
103
                to_next = to;
×
104
                return std::codecvt_base::partial;
105
            }
×
106
            *to++ = ucs_result;
107
        }
×
108
        from_next = from;
×
109
        to_next = to;
110

111
        // Were we done converting or did we run out of destination space?
112
        return from == from_end ? std::codecvt_base::ok :
×
113
                                  std::codecvt_base::partial;
114
    }
115

×
116
    std::codecvt_base::result utf8_codecvt_facet::do_out(
117
        std::mbstate_t& /*state*/, wchar_t const* from, wchar_t const* from_end,
118
        wchar_t const*& from_next, char* to, char* to_end, char*& to_next) const
119
    {
120
        // RG - consider merging this table with the other one
×
121
        wchar_t const octet1_modifier_table[] = {
122
            0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
123

124
        constexpr wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
×
125
        while (from != from_end && to != to_end)
126
        {
127
            // Check for invalid UCS-4 character
128
            if (static_cast<std::uint32_t>(*from) >
129
                static_cast<std::uint32_t>(max_wchar))
130
            {
131
                from_next = from;
132
                to_next = to;
133
                return std::codecvt_base::error;
134
            }
×
135

136
            int const cont_octet_count = get_cont_octet_out_count(*from);
137

×
138
            // RG  - comment this formula better
139
            int shift_exponent = cont_octet_count * 6;
140

×
141
            // Process the first character
×
142
            *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
143
                static_cast<unsigned char>(*from / (1 << shift_exponent)));
144

145
            // Process the continuation characters
146
            // Invariants: At   the start of the loop:
147
            //   1) 'i' continuing octets   have been generated
148
            //   2) '*to'   points to the next location to place an octet
149
            //   3) shift_exponent is   6 more than needed for the next octet
×
150
            int i = 0;
151
            while (i != cont_octet_count && to != to_end)
×
152
            {
×
153
                shift_exponent -= 6;
×
154
                *to++ = static_cast<char>(
×
155
                    0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
156
                ++i;
157
            }
×
158
            // If   we filled up the out buffer before encoding the character
159
            if (to == to_end && i != cont_octet_count)
×
160
            {
×
161
                from_next = from;
×
162
                to_next = to - (static_cast<std::size_t>(i) + 1);
163
                return std::codecvt_base::partial;
×
164
            }
165
            ++from;
×
166
        }
×
167
        from_next = from;
168
        to_next = to;
169

170
        // Were we done or did we run out of destination space
×
171
        return from == from_end ? std::codecvt_base::ok :
172
                                  std::codecvt_base::partial;
173
    }
174

175
    // How many char objects can I process to get <= max_limit
×
176
    // wchar_t objects?
177
    int utf8_codecvt_facet::do_length(std::mbstate_t&, char const* from,
178
        char const* from_end, std::size_t max_limit) const noexcept
179
    {
180
        // RG - this code is confusing!  I need a better way to express it.
181
        // and test cases.
182

183
        // Invariants:
184
        // 1) last_octet_count has the size of the last measured character
185
        // 2) char_count holds the number of characters shown to fit
186
        // within the bounds so far (no greater than max_limit)
187
        // 3) from_next points to the octet 'last_octet_count' before the
188
        // last measured character.
189
        std::size_t last_octet_count = 0;
190
        std::size_t char_count = 0;
191
        char const* from_next = from;
192

×
193
        // Use "<" because the buffer may represent incomplete characters
×
194
        while (
195
            from_next + last_octet_count <= from_end && char_count <= max_limit)
196
        {
×
197
            from_next += last_octet_count;
×
198
            last_octet_count =
×
199
                static_cast<std::size_t>(get_octet_count(*from_next));
200
            ++char_count;
×
201
        }
202
        return static_cast<int>(from_next - from);
203
    }
×
204

205
    // Reports how many octets make up the character introduced by
    // lead_octet, judged by the run of 1 bits at its most significant end.
    //
    //   0x00-0x7f -> 1      0xc0-0xdf -> 2      0xe0-0xef -> 3
    //   0xf0-0xf7 -> 4      0xf8-0xfb -> 5      anything else -> 6
    //
    // "Anything else" covers 0xfc-0xff and also 0x80-0xbf; no validation is
    // done here.
    unsigned int utf8_codecvt_facet::get_octet_count(
        unsigned char lead_octet) noexcept
    {
        // Most significant bit clear: a single-octet character.
        if (lead_octet < 0x80)
        {
            return 1;
        }

        // 0x80-0xbf matches none of the multi-octet lead ranges below and
        // therefore yields the same answer as the final fallback.
        if (lead_octet < 0xc0)
        {
            return 6;
        }

        if (lead_octet < 0xe0)
        {
            return 2;
        }
        if (lead_octet < 0xf0)
        {
            return 3;
        }
        if (lead_octet < 0xf8)
        {
            return 4;    // -V112
        }
        if (lead_octet < 0xfc)
        {
            return 5;
        }
        return 6;
    }
226

227
    namespace detail {

        // Number of continuation octets (total octets minus one) needed to
        // encode 'word'. This generic version never answers more than 2; the
        // template argument only selects between this version and the
        // specialization below.
        template <std::size_t s>
        int get_cont_octet_out_count_impl(wchar_t word) noexcept
        {
            return word < 0x80 ? 0 : (word < 0x800 ? 1 : 2);
        }

        // Variant selected for the template argument 4. When WCHAR_MAX is
        // above 0x10000 it distinguishes up to 5 continuation octets.
        template <>
        int get_cont_octet_out_count_impl<4>(wchar_t word) noexcept
        {
            if (word < 0x800)
            {
                return word < 0x80 ? 0 : 1;
            }

            // The comparisons against constants above 0x10000 are compiled
            // only when WCHAR_MAX says wchar_t can exceed that value.
            // Elsewhere they could provoke warnings (a problem when warnings
            // are treated as errors), so they are preprocessed away.
            // VC++ 7.1 and earlier report an unusable WCHAR_MAX and get the
            // conservative answer as well.
#if defined(_MSC_VER) && _MSC_VER <= 1310    // 7.1 or earlier
            return 2;
#elif WCHAR_MAX > 0x10000
            if (word < 0x10000)
            {
                return 2;
            }
            if (word < 0x200000)
            {
                return 3;
            }
            return word < 0x4000000 ? 4 : 5;
#else
            return 2;
#endif
        }

    }    // namespace detail
285

286
    // How many "continuing octets" will be needed for this word
×
287
    // ==   total octets - 1.
288
    int utf8_codecvt_facet::get_cont_octet_out_count(
289
        wchar_t word) const noexcept
×
290
    {
291
        return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
292
    }
293
}    // namespace hpx::program_options::detail
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc