#882

Committed 31 Aug 2023 07:44PM UTC coverage: 41.798% (-44.7%) from 86.546%

Build # #882

Build Type

push

Committed by

Commit Message

Run Details

19442 of 46514 relevant lines covered (41.8%)

126375.38 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

5.48

/libs/core/program_options/src/utf8_codecvt_facet.cpp

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
// utf8_codecvt_facet.cpp

//  Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
//  Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
//
//  SPDX-License-Identifier: BSL-1.0
//  Use, modification and distribution is subject to the Boost Software
//  License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
// learn how this file should be used.

#include <hpx/program_options/config.hpp>
#include <hpx/program_options/detail/utf8_codecvt_facet.hpp>

#include <cstddef>
#include <cstdint>
#include <cstdlib>    // for multi-byte conversion routines
#include <limits>

// If we don't have wstring, then Unicode support
// is not available anyway, so we don't need to even
// compile this file.

/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
namespace hpx::program_options::detail {

    // implementation for wchar_t

    // Construct the facet.
    //
    // no_locale_manage is forwarded unchanged to the std::codecvt base class
    // constructor, where it serves as that constructor's reference-count
    // ("refs") argument. The facet itself stores no additional state here.
    utf8_codecvt_facet::utf8_codecvt_facet(std::size_t no_locale_manage)
      : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
    {
    }

    // Defaulted out of line: the compiler-generated destructor is used, but
    // its definition is emitted in this translation unit.
    utf8_codecvt_facet::~utf8_codecvt_facet() = default;

    // Translate incoming UTF-8 into UCS-4
    std::codecvt_base::result utf8_codecvt_facet::do_in(
        std::mbstate_t& /*state*/, char const* from, char const* from_end,
        char const*& from_next, wchar_t* to, wchar_t* to_end,
        wchar_t*& to_next) const
    {
        // Basic algorithm:  The first octet determines how many
        // octets total make up the UCS-4 character.  The remaining
        // "continuing octets" all begin with "10". To convert, subtract the
        // amount that specifies the number of octets from the first
        // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
        // then mash the whole lot together.  Note that each continuing
        // octet only uses 6 bits as unique values, so only shift by multiples
        // of 6 to combine.
        while (from != from_end && to != to_end)
        {
            // Error checking   on the first octet
            if (invalid_leading_octet(*from))
            {
                from_next = from;
                to_next = to;
                return std::codecvt_base::error;
            }

            // The first octet is   adjusted by a value dependent upon the
            // number   of "continuing octets" encoding the character
            int const cont_octet_count =
                static_cast<int>(get_cont_octet_count(*from));
            wchar_t const octet1_modifier_table[] = {
                0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

            // The unsigned char conversion is necessary in case char is
            // signed   (I learned this the hard way)
            wchar_t ucs_result = static_cast<unsigned char>(*from++) -
                octet1_modifier_table[cont_octet_count];

            // Invariants   :
            //   1) At the start of the loop,   'i' continuing characters have been
            //    processed
            //   2) *from   points to the next continuing character to be processed.
            int i = 0;
            while (i != cont_octet_count && from != from_end)
            {
                // Error checking on continuing characters
                if (invalid_continuing_octet(*from))
                {
                    from_next = from;
                    to_next = to;
                    return std::codecvt_base::error;
                }

                ucs_result *= (1 << 6);

                // each continuing character has an extra (10xxxxxx)b attached to
                // it that must be removed.
                ucs_result += static_cast<unsigned char>(*from++) - 0x80;
                ++i;
            }

            // If   the buffer ends with an incomplete unicode character...
            if (from == from_end && i != cont_octet_count)
            {
                // rewind "from" to before the current character translation
                from_next = from - (static_cast<std::size_t>(i) + 1);
                to_next = to;
                return std::codecvt_base::partial;
            }
            *to++ = ucs_result;
        }
        from_next = from;
        to_next = to;

        // Were we done converting or did we run out of destination space?
        return from == from_end ? std::codecvt_base::ok :
                                  std::codecvt_base::partial;
    }

    std::codecvt_base::result utf8_codecvt_facet::do_out(
        std::mbstate_t& /*state*/, wchar_t const* from, wchar_t const* from_end,
        wchar_t const*& from_next, char* to, char* to_end, char*& to_next) const
    {
        // RG - consider merging this table with the other one
        wchar_t const octet1_modifier_table[] = {
            0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

        constexpr wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
        while (from != from_end && to != to_end)
        {
            // Check for invalid UCS-4 character
            if (static_cast<std::uint32_t>(*from) >
                static_cast<std::uint32_t>(max_wchar))
            {
                from_next = from;
                to_next = to;
                return std::codecvt_base::error;
            }

            int const cont_octet_count = get_cont_octet_out_count(*from);

            // RG  - comment this formula better
            int shift_exponent = cont_octet_count * 6;

            // Process the first character
            *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
                static_cast<unsigned char>(*from / (1 << shift_exponent)));

            // Process the continuation characters
            // Invariants: At   the start of the loop:
            //   1) 'i' continuing octets   have been generated
            //   2) '*to'   points to the next location to place an octet
            //   3) shift_exponent is   6 more than needed for the next octet
            int i = 0;
            while (i != cont_octet_count && to != to_end)
            {
                shift_exponent -= 6;
                *to++ = static_cast<char>(
                    0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
                ++i;
            }
            // If   we filled up the out buffer before encoding the character
            if (to == to_end && i != cont_octet_count)
            {
                from_next = from;
                to_next = to - (static_cast<std::size_t>(i) + 1);
                return std::codecvt_base::partial;
            }
            ++from;
        }
        from_next = from;
        to_next = to;

        // Were we done or did we run out of destination space
        return from == from_end ? std::codecvt_base::ok :
                                  std::codecvt_base::partial;
    }

    // How many char objects can I process to get <= max_limit
    // wchar_t objects?
    int utf8_codecvt_facet::do_length(std::mbstate_t&, char const* from,
        char const* from_end, std::size_t max_limit) const noexcept
    {
        // RG - this code is confusing!  I need a better way to express it.
        // and test cases.

        // Invariants:
        // 1) last_octet_count has the size of the last measured character
        // 2) char_count holds the number of characters shown to fit
        // within the bounds so far (no greater than max_limit)
        // 3) from_next points to the octet 'last_octet_count' before the
        // last measured character.
        std::size_t last_octet_count = 0;
        std::size_t char_count = 0;
        char const* from_next = from;

        // Use "<" because the buffer may represent incomplete characters
        while (
            from_next + last_octet_count <= from_end && char_count <= max_limit)
        {
            from_next += last_octet_count;
            last_octet_count =
                static_cast<std::size_t>(get_octet_count(*from_next));
            ++char_count;
        }
        return static_cast<int>(from_next - from);
    }

    // Total number of octets announced by lead_octet, decided purely from the
    // value of that octet by walking an ascending ladder of thresholds.
    unsigned int utf8_codecvt_facet::get_octet_count(
        unsigned char lead_octet) noexcept
    {
        // 0xxxxxxx: most significant bit clear, a single octet
        if (lead_octet < 0x80)
            return 1;

        // 0x80..0xbf is not covered by any of the leading-octet ranges
        // below; such values get the same answer as the largest lead
        // octets: 6
        if (lead_octet < 0xc0)
            return 6;

        // From here on the answer is the number of consecutive 1 bits
        // starting at the most significant bit.
        if (lead_octet < 0xe0)    // 0xc0..0xdf
            return 2;
        if (lead_octet < 0xf0)    // 0xe0..0xef
            return 3;
        if (lead_octet < 0xf8)    // 0xf0..0xf7
            return 4;    // -V112
        if (lead_octet < 0xfc)    // 0xf8..0xfb
            return 5;

        // 0xfc..0xff
        return 6;
    }

    namespace detail {

        // Number of continuing octets needed to encode 'word' in UTF-8.
        //
        // The template argument is a wchar_t size in bytes. This primary
        // template serves every size except 4 (which has its own
        // specialization below) and never reports more than two continuing
        // octets.
        template <std::size_t WideCharSize>
        int get_cont_octet_out_count_impl(wchar_t word) noexcept
        {
            return word < 0x80 ? 0 : (word < 0x800 ? 1 : 2);
        }

        // Variant for a 4 byte wchar_t, which may need up to five continuing
        // octets.
        template <>
        int get_cont_octet_out_count_impl<4>(wchar_t word) noexcept
        {
            if (word < 0x80)
                return 0;
            if (word < 0x800)
                return 1;

            // The comparisons against large constants below trigger warnings
            // on platforms whose wchar_t cannot hold such values. Those
            // platforms never use this specialization, but the warnings
            // would still break builds that treat warnings as errors, hence
            // the preprocessor guard. WCHAR_MAX is expected to be supplied by
            // the standard headers included at the top of this file.
            // VC++ 7.1 and earlier define an invalid WCHAR_MAX and are
            // special-cased.
#if defined(_MSC_VER) && _MSC_VER <= 1310    // 7.1 or earlier
            return 2;
#elif WCHAR_MAX > 0x10000
            if (word < 0x10000)
                return 2;
            if (word < 0x200000)
                return 3;
            return word < 0x4000000 ? 4 : 5;
#else
            return 2;
#endif
        }

    }    // namespace detail

    // Number of "continuing octets" (total octets minus the leading one)
    // needed to encode 'word'.
    int utf8_codecvt_facet::get_cont_octet_out_count(
        wchar_t word) const noexcept
    {
        // The implementation is selected at compile time by the size of
        // wchar_t on this platform.
        constexpr std::size_t wide_char_bytes = sizeof(wchar_t);
        return detail::get_cont_octet_out_count_impl<wide_char_bytes>(word);
    }
}    // namespace hpx::program_options::detail

1	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2	// utf8_codecvt_facet.ipp
3
4	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6	//
7	// SPDX-License-Identifier: BSL-1.0
8	// Use, modification and distribution is subject to the Boost Software
9	// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
10	// http://www.boost.org/LICENSE_1_0.txt)
11
12	// Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
13	// learn how this file should be used.
14
15	#include <hpx/program_options/config.hpp>
16	#include <hpx/program_options/detail/utf8_codecvt_facet.hpp>
17
18	#include <cstddef>
19	#include <cstdint>
20	#include <cstdlib> // for multi-byte conversion routines
21	#include <limits>
22
23	// If we don't have wstring, then Unicode support
24	// is not available anyway, so we don't need to even
25	// compile this file.
26
27	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
28	namespace hpx::program_options::detail {
29
30	// implementation for wchar_t
31		64✔
32	utf8_codecvt_facet::utf8_codecvt_facet(std::size_t no_locale_manage)	64✔
33	: std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
34	{	64✔
35	}
36		64✔
37	utf8_codecvt_facet::~utf8_codecvt_facet() = default;
38
39	// Translate incoming UTF-8 into UCS-4	×
40	std::codecvt_base::result utf8_codecvt_facet::do_in(
41	std::mbstate_t& /*state*/, char const* from, char const* from_end,
42	char const*& from_next, wchar_t* to, wchar_t* to_end,
43	wchar_t*& to_next) const
44	{
45	// Basic algorithm: The first octet determines how many
46	// octets total make up the UCS-4 character. The remaining
47	// "continuing octets" all begin with "10". To convert, subtract the
48	// amount that specifies the number of octets from the first
49	// octet. Subtract 0x80 (1000 0000) from each continuing octet,
50	// then mash the whole lot together. Note that each continuing
51	// octet only uses 6 bits as unique values, so only shift by multiples
52	// of 6 to combine.	×
53	while (from != from_end && to != to_end)
54	{
55	// Error checking on the first octet	×
56	if (invalid_leading_octet(*from))
57	{	×
58	from_next = from;	×
59	to_next = to;	×
60	return std::codecvt_base::error;
61	}
62
63	// The first octet is adjusted by a value dependent upon the
64	// number of "continuing octets" encoding the character
65	int const cont_octet_count =	×
66	static_cast<int>(get_cont_octet_count(*from));	×
67	wchar_t const octet1_modifier_table[] = {
68	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
69
70	// The unsigned char conversion is necessary in case char is
71	// signed (I learned this the hard way)	×
72	wchar_t ucs_result = static_cast<unsigned char>(*from++) -	×
73	octet1_modifier_table[cont_octet_count];
74
75	// Invariants :
76	// 1) At the start of the loop, 'i' continuing characters have been
77	// processed
78	// 2) *from points to the next continuing character to be processed.
79	int i = 0;	×
80	while (i != cont_octet_count && from != from_end)
81	{
82	// Error checking on continuing characters	×
83	if (invalid_continuing_octet(*from))
84	{	×
85	from_next = from;	×
86	to_next = to;	×
87	return std::codecvt_base::error;
88	}
89		×
90	ucs_result *= (1 << 6);
91
92	// each continuing character has an extra (10xxxxxx)b attached to
93	// it that must be removed.	×
94	ucs_result += static_cast<unsigned char>(*from++) - 0x80;	×
95	++i;
96	}
97
98	// If the buffer ends with an incomplete unicode character...	×
99	if (from == from_end && i != cont_octet_count)
100	{
101	// rewind "from" to before the current character translation	×
102	from_next = from - (static_cast<std::size_t>(i) + 1);	×
103	to_next = to;	×
104	return std::codecvt_base::partial;
105	}	×
106	*to++ = ucs_result;
107	}	×
108	from_next = from;	×
109	to_next = to;
110
111	// Were we done converting or did we run out of destination space?
112	return from == from_end ? std::codecvt_base::ok :	×
113	std::codecvt_base::partial;
114	}
115		×
116	std::codecvt_base::result utf8_codecvt_facet::do_out(
117	std::mbstate_t& /*state*/, wchar_t const* from, wchar_t const* from_end,
118	wchar_t const*& from_next, char* to, char* to_end, char*& to_next) const
119	{
120	// RG - consider merging this table with the other one	×
121	wchar_t const octet1_modifier_table[] = {
122	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
123
124	constexpr wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();	×
125	while (from != from_end && to != to_end)
126	{
127	// Check for invalid UCS-4 character
128	if (static_cast<std::uint32_t>(*from) >
129	static_cast<std::uint32_t>(max_wchar))
130	{
131	from_next = from;
132	to_next = to;
133	return std::codecvt_base::error;
134	}	×
135
136	int const cont_octet_count = get_cont_octet_out_count(*from);
137		×
138	// RG - comment this formula better
139	int shift_exponent = cont_octet_count * 6;
140		×
141	// Process the first character	×
142	*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
143	static_cast<unsigned char>(*from / (1 << shift_exponent)));
144
145	// Process the continuation characters
146	// Invariants: At the start of the loop:
147	// 1) 'i' continuing octets have been generated
148	// 2) '*to' points to the next location to place an octet
149	// 3) shift_exponent is 6 more than needed for the next octet	×
150	int i = 0;
151	while (i != cont_octet_count && to != to_end)	×
152	{	×
153	shift_exponent -= 6;	×
154	*to++ = static_cast<char>(	×
155	0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
156	++i;
157	}	×
158	// If we filled up the out buffer before encoding the character
159	if (to == to_end && i != cont_octet_count)	×
160	{	×
161	from_next = from;	×
162	to_next = to - (static_cast<std::size_t>(i) + 1);
163	return std::codecvt_base::partial;	×
164	}
165	++from;	×
166	}	×
167	from_next = from;
168	to_next = to;
169
170	// Were we done or did we run out of destination space	×
171	return from == from_end ? std::codecvt_base::ok :
172	std::codecvt_base::partial;
173	}
174
175	// How many char objects can I process to get <= max_limit	×
176	// wchar_t objects?
177	int utf8_codecvt_facet::do_length(std::mbstate_t&, char const* from,
178	char const* from_end, std::size_t max_limit) const noexcept
179	{
180	// RG - this code is confusing! I need a better way to express it.
181	// and test cases.
182
183	// Invariants:
184	// 1) last_octet_count has the size of the last measured character
185	// 2) char_count holds the number of characters shown to fit
186	// within the bounds so far (no greater than max_limit)
187	// 3) from_next points to the octet 'last_octet_count' before the
188	// last measured character.
189	std::size_t last_octet_count = 0;
190	std::size_t char_count = 0;
191	char const* from_next = from;
192		×
193	// Use "<" because the buffer may represent incomplete characters	×
194	while (
195	from_next + last_octet_count <= from_end && char_count <= max_limit)
196	{	×
197	from_next += last_octet_count;	×
198	last_octet_count =	×
199	static_cast<std::size_t>(get_octet_count(*from_next));
200	++char_count;	×
201	}
202	return static_cast<int>(from_next - from);
203	}	×
204
205	unsigned int utf8_codecvt_facet::get_octet_count(
206	unsigned char lead_octet) noexcept
207	{	×
208	// if the 0-bit (MSB) is 0, then 1 character
209	if (lead_octet <= 0x7f)
210	return 1;
211
212	// Otherwise the count number of consecutive 1 bits starting at MSB
213	// assert(0xc0 <= lead_octet && lead_octet <= 0xfd);	×
214
215	if (0xc0 <= lead_octet && lead_octet <= 0xdf)	×
216	return 2;
217	else if (0xe0 <= lead_octet && lead_octet <= 0xef)	×
218	return 3;
219	else if (0xf0 <= lead_octet && lead_octet <= 0xf7)	×
220	return 4; // -V112
221	else if (0xf8 <= lead_octet && lead_octet <= 0xfb)
222	return 5;	×
223	else
224	return 6;
225	}
226
227	namespace detail {
228
229	template <std::size_t s>
230	int get_cont_octet_out_count_impl(wchar_t word) noexcept
231	{
232	if (word < 0x80)
233	{
234	return 0;
235	}
236	if (word < 0x800)
237	{
238	return 1;
239	}
240	return 2;
241	}
242		×
243	template <>
244	int get_cont_octet_out_count_impl<4>(wchar_t word) noexcept	×
245	{
246	if (word < 0x80)
247	{
248	return 0;	×
249	}
250	if (word < 0x800)
251	{
252	return 1;
253	}
254
255	// Note that the following code will generate warnings on some platforms
256	// where wchar_t is defined as UCS2. The warnings are superfluous as the
257	// specialization is never instantiated with such compilers, but this
258	// can cause problems if warnings are being treated as errors, so we guard
259	// against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do
260	// should be enough to get WCHAR_MAX defined.
261	// cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
262	#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
263	return 2;
264	#elif WCHAR_MAX > 0x10000	×
265
266	if (word < 0x10000)
267	{
268	return 2;	×
269	}
270	if (word < 0x200000)
271	{
272	return 3;	×
273	}
274	if (word < 0x4000000)	×
275	{
276	return 4;
277	}
278	return 5;
279	#else
280	return 2;
281	#endif
282	}
283
284	} // namespace detail
285
286	// How many "continuing octets" will be needed for this word	×
287	// == total octets - 1.
288	int utf8_codecvt_facet::get_cont_octet_out_count(
289	wchar_t word) const noexcept	×
290	{
291	return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
292	}
293	} // namespace hpx::program_options::detail

STEllAR-GROUP / hpx / #882

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous