kraen.hansen_75

Committed 23 Aug 2024 10:17AM UTC coverage: 91.772% (+0.01%) from 91.761%

Build # kraen.hansen_75

Build Type

Pull #7995

Evergreen

Committed by

kraenhansen

Commit Message

Updated bindgen spec and helpers

Pull Request Pull Request #7995: Updated bindgen spec and helpers

Run Details

73344 of 130388 branches covered (56.25%)

139610 of 152127 relevant lines covered (91.77%)

6529327.69 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

83.83

/src/realm/unicode.cpp

/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#include <realm/unicode.hpp>

#include <algorithm>
#include <clocale>
#include <vector>

#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#else
#include <ctype.h>
#endif

namespace realm {

// clang-format off
// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
//
// The result is derived purely from the bit pattern of the lead byte:
//   0x00..0xBF -> 1 (ASCII, and stray continuation bytes are counted as one byte)
//   0xC0..0xDF -> 2
//   0xE0..0xEF -> 3
//   0xF0..0xF7 -> 4
//   0xF8..0xFB -> 5
//   0xFC..0xFD -> 6
//   0xFE..0xFF -> 1
size_t sequence_length(char lead)
{
    // Work on the unsigned value so bytes >= 0x80 compare correctly even when 'char' is signed.
    const unsigned int byte = static_cast<unsigned char>(lead);

    if (byte < 0xC0)
        return 1;
    if (byte < 0xE0)
        return 2;
    if (byte < 0xF0)
        return 3;
    if (byte < 0xF8)
        return 4;
    if (byte < 0xFC)
        return 5;
    if (byte < 0xFE)
        return 6;
    return 1;
}
// clang-format on

// Compares the UTF-8 sequence that starts at 'begin' (and is bounded by
// 'end') with the bytes starting at 'begin2'.
//
// Returns true and moves 'begin' past the sequence when the two are
// byte-wise identical. Returns false and leaves 'begin' untouched otherwise.
//
// 'begin' must be dereferenceable (begin != end), and 'begin2' must be
// readable for as many bytes as the sequence at 'begin' turns out to span;
// neither is checked here.
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    const char* const lhs = begin;

    if (*lhs != *begin2)
        return false;

    size_t length = 1;
    const bool is_multibyte_lead = (static_cast<unsigned char>(*lhs) & 0x80) != 0;
    if (is_multibyte_lead) {
        // Every following byte of the form '10xxxxxx' is treated as a
        // continuation byte belonging to this character.
        for (; lhs + length != end; ++length) {
            const unsigned char next = static_cast<unsigned char>(lhs[length]);
            if ((next & 0xC0) != 0x80)
                break;
            if (lhs[length] != begin2[length])
                return false;
        }
    }

    begin = lhs + length;
    return true;
}

// Decodes the UTF-8 sequence starting at 'character' into its numeric value.
//
// The sequence length is taken from the lead byte via sequence_length(). No
// validation is performed and as many bytes as the lead byte announces are
// read, so this may read out of bounds on malformed input. The caller must
// check.
uint32_t utf8value(const char* character)
{
    const auto* bytes = reinterpret_cast<const unsigned char*>(character);
    const size_t length = sequence_length(*character);

    uint32_t value = bytes[0];
    if (length > 1) {
        // Strip the length-marker bits from the lead byte, keeping only its payload bits.
        value &= 0x3fu >> (length - 1);

        // Each continuation byte contributes its low six bits.
        const unsigned char* const stop = bytes + length;
        for (const unsigned char* p = bytes + 1; p != stop; ++p)
            value = (value << 6) | (*p & 0x3fu);
    }

    return value;
}

// Converts UTF-8 'source' into upper case (when 'upper' is true) or lower
// case (when 'upper' is false). Returns the converted string, or an empty
// Optional when the input is rejected as invalid UTF-8.
//
// This function preserves the byte length of each UTF-8 character in the
// following way: if an output character differs in size, it is simply
// substituted by the original character. This may of course give wrong search
// results in very special cases. Todo.
//
// Two implementations are selected at compile time:
//  - _WIN32: the input is converted chunk-wise to wide characters, case
//    mapped with LCMapStringEx and converted back. Note that the size
//    fallback is applied per chunk here: if the converted chunk does not have
//    the same byte length as the source chunk, the whole source chunk is
//    copied unchanged.
//  - otherwise: a hand-written mapping that changes case only for ASCII
//    letters and for the two-byte code points 0xC0..0xDE / 0xE0..0xFE
//    (excluding 0xD7 and 0xF7). All other two-byte sequences are re-encoded
//    unchanged, and three- and four-byte sequences are checked for
//    well-formed continuation bytes and copied unchanged.
util::Optional<std::string> case_map(StringData source, bool upper)
{
    // The result always has exactly the same byte length as the source.
    std::string result;
    result.resize(source.size());

#if defined(_WIN32)
    // Chunk size: maximum number of input bytes per iteration and also the
    // capacity (in wchar_t) of the temporary buffers below.
    constexpr int tmp_buffer_size = 32;
    const char* begin = source.data();
    const char* end = begin + source.size();
    auto output = result.begin();
    while (begin != end) {
        // Number of source bytes to process in this iteration.
        auto n = end - begin;
        if (n > tmp_buffer_size) {
            // Break the input string into chunks - but don't break in the middle of a multibyte character
            const char* p = begin;
            const char* buffer_end = begin + tmp_buffer_size;
            while (p < buffer_end) {
                size_t len = sequence_length(*p);
                p += len;
                if (p > buffer_end) {
                    // The last character would cross the chunk boundary; leave it for the next chunk.
                    p -= len;
                    break;
                }
            }
            n = p - begin;
        }

        wchar_t tmp[tmp_buffer_size];

        // A result of zero is treated as a conversion failure (MB_ERR_INVALID_CHARS is requested).
        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
        if (n2 == 0)
            return util::none;

        // Terminate the buffer when there is room; the calls below pass the explicit length n2.
        if (n2 < tmp_buffer_size)
            tmp[n2] = 0;

        // Note: If tmp[0] == 0, it is because the string contains a
        // null-character, which is perfectly fine.

        // NOTE(review): the return value of LCMapStringEx is not checked.
        wchar_t mapped_tmp[tmp_buffer_size];
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
                      tmp_buffer_size, nullptr, nullptr, 0);

        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
        // to catch invalid UTF-8. Even though the documentation says
        // unambiguously that it is supposed to work, it doesn't. When
        // the flag is specified, the function fails with error
        // ERROR_INVALID_FLAGS.
        DWORD flags = 0;
        // Remaining space in 'result': 'output' advances in lockstep with 'begin'.
        auto m = static_cast<int>(end - begin);
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
        // A zero result caused by insufficient space is not an error; it is
        // handled by the size-mismatch fallback below.
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
            return util::none;

        if (n3 != n) {
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
        }

        begin += n;
        output += n;
    }

    return result;
#else
    size_t sz = source.size();
    typedef std::char_traits<char> traits;
    // 'i' indexes both 'source' and 'result'. At the bottom of each iteration
    // 'c' holds the byte to store at result[i]; for multibyte characters the
    // branches below store the leading bytes themselves and advance 'i' to
    // the final byte of the sequence.
    for (size_t i = 0; i < sz; ++i) {
        char c = source[i];
        // Byte value as an int, so lead bytes can be tested with bit masks.
        auto int_val = traits::to_int_type(c);

        // Copies an n-byte sequence starting at 'i' unchanged. Stores all but
        // the last byte in 'result', leaving 'i' on the last byte and 'c'
        // holding it. Returns false if the sequence does not fit in the input
        // or if a trailing byte is not of the form '10xxxxxx'.
        auto copy_bytes = [&](size_t n) {
            if (i + n > sz) {
                return false;
            }
            for (size_t j = 1; j < n; j++) {
                result[i++] = c;
                c = source[i];
                if ((c & 0xC0) != 0x80) {
                    return false;
                }
            }
            return true;
        };

        if (int_val < 0x80) {
            // Handle ASCII
            if (upper && (c >= 'a' && c <= 'z')) {
                c -= 0x20;
            }
            else if (!upper && (c >= 'A' && c <= 'Z')) {
                c += 0x20;
            }
        }
        else {
            if ((int_val & 0xE0) == 0xc0) {
                // 2 byte utf-8
                if (i + 2 > sz) {
                    return {};
                }
                c = source[i + 1];
                if ((c & 0xC0) != 0x80) {
                    return {};
                }
                // Decode the 11-bit code point from the lead and continuation byte.
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
                // Handle some Latin-1 supplement characters
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
                    u -= 0x20;
                }
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
                    u += 0x20;
                }

                // Re-encode as two bytes: the lead byte is stored here, the
                // continuation byte is left in 'c' for the store below.
                result[i++] = static_cast<char>((u >> 6) | 0xC0);
                c = static_cast<char>((u & 0x3f) | 0x80);
            }
            else if ((int_val & 0xF0) == 0xE0) {
                // 3 byte utf-8
                if (!copy_bytes(3)) {
                    return {};
                }
            }
            else if ((int_val & 0xF8) == 0xF0) {
                // 4 byte utf-8
                if (!copy_bytes(4)) {
                    return {};
                }
            }
            else {
                // Stray continuation byte or unsupported lead byte.
                return {};
            }
        }
        result[i] = c;
    }
    return result;
#endif
}

// Error-tolerant variant of case_map(): returns the case-mapped string, or an
// empty string when the two-argument case_map() reports a failure.
std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
{
    if (auto mapped = case_map(source, upper))
        return std::move(*mapped);
    return std::string();
}

// Returns true if 'haystack' equals the needle, where the needle is supplied
// in two byte-length-preserving variants: 'needle_upper' and 'needle_lower'.
// Both must be readable for at least haystack.size() bytes.
//
// NOTE: A cheap byte-wise pre-check runs first: every haystack byte must
// match the corresponding byte of one of the two variants. That is fast but
// not sufficient, because the bytes of a single multibyte character could be
// taken from different variants. A second pass therefore compares one whole
// UTF-8 character at a time. The signature is similar in spirit to
// std::equal().
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
{
    const size_t size = haystack.size();

    // Pass 1: byte-wise filter.
    for (size_t pos = 0; pos < size; ++pos) {
        const char ch = haystack[pos];
        const bool byte_matches = (ch == needle_lower[pos]) || (ch == needle_upper[pos]);
        if (!byte_matches)
            return false;
    }

    // Pass 2: rigorous comparison, character by character.
    const char* const first = haystack.data();
    const char* const last = first + size;
    const char* cursor = first;
    while (cursor != last) {
        const auto offset = cursor - first;
        // equal_sequence() advances 'cursor' only when it reports a match.
        if (equal_sequence(cursor, last, needle_lower + offset))
            continue;
        if (equal_sequence(cursor, last, needle_upper + offset))
            continue;
        return false;
    }
    return true;
}


// Searches 'haystack' for the needle given as the upper/lower case pair
// 'needle_upper' / 'needle_lower', each 'needle_size' bytes long. Returns the
// byte offset of the first match, or haystack.size() if there is none. The
// signature is similar in spirit to std::search().
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
{
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
    const size_t haystack_size = haystack.size();
    if (needle_size > haystack_size)
        return haystack_size; // Needle cannot fit anywhere

    const size_t last_start = haystack_size - needle_size;
    for (size_t start = 0; start <= last_start; ++start) {
        const StringData window = haystack.substr(start, needle_size);
        if (equal_case_fold(window, needle_upper, needle_lower))
            return start;
    }
    return haystack_size; // Not found
}

/// Case-insensitive substring test using a skip table.
///
/// 'charmap' maps each byte value (for both the upper- and lowercase needle) to the distance the search
/// position may be advanced, with zero meaning the byte does not occur in the needle. This lets the method
/// apply a Boyer-Moore style search. The map is calculated in the StringNode<ContainsIns> class (so it can be
/// reused across searches).
///
/// An empty needle is reported as contained only when the haystack is non-empty.
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
                  const std::array<uint8_t, 256>& charmap)
{
    if (needle_size == 0)
        return haystack.size() != 0;

    // Cache everything that is invariant across the scan.
    const size_t tail = needle_size - 1;
    const unsigned char tail_upper = needle_upper[tail];
    const unsigned char tail_lower = needle_lower[tail];
    const size_t haystack_size = haystack.size();
    const char* const bytes = haystack.data();

    // 'pos' is the haystack index aligned with the needle's last byte.
    for (size_t pos = tail; pos < haystack_size;) {
        const unsigned char probe = bytes[pos];

        // Only do the full comparison when the last byte is a plausible match.
        if (probe == tail_upper || probe == tail_lower) {
            const StringData candidate = haystack.substr(pos - tail, needle_size);
            if (equal_case_fold(candidate, needle_upper, needle_lower))
                return true; // text found!
        }

        // No match here: a zero entry means the byte is absent from the
        // needle, so the whole needle length can be skipped.
        const size_t shift = charmap[probe];
        pos += (shift != 0) ? shift : needle_size;
    }

    return false;
}

// Case-insensitive LIKE match of 'text' against a pattern supplied in
// pre-computed 'upper' and 'lower' case variants.
//
// If either 'text' or 'lower' is null, the result is true only when both are
// null. Otherwise the decision is delegated to StringData::matchlike_ins().
bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    const bool text_is_null = text.is_null();
    const bool pattern_is_null = lower.is_null();

    if (!text_is_null && !pattern_is_null)
        return StringData::matchlike_ins(text, lower, upper);

    return text_is_null && pattern_is_null;
}

// Case-insensitive LIKE match of 'text' against 'pattern'.
//
// If either argument is null, the result is true only when both are null.
// Otherwise the pattern is case mapped to upper and lower case (a pattern
// that cannot be case mapped is treated as an empty pattern, see the
// IgnoreErrors overload of case_map()) and the decision is delegated to
// StringData::matchlike_ins().
bool string_like_ins(StringData text, StringData pattern) noexcept
{
    if (text.is_null() || pattern.is_null()) {
        return (text.is_null() && pattern.is_null());
    }

    const std::string upper = case_map(pattern, true, IgnoreErrors);
    const std::string lower = case_map(pattern, false, IgnoreErrors);

    // Pass explicit lengths instead of c_str(): constructing the StringData
    // from a NUL-terminated C string would silently truncate a pattern that
    // contains an embedded NUL byte, unlike the overload that receives the
    // upper/lower variants as StringData.
    return StringData::matchlike_ins(text, StringData(lower.data(), lower.size()),
                                     StringData(upper.data(), upper.size()));
}

} // namespace realm

1	/*************************************************************************
2	*
3	* Copyright 2016 Realm Inc.
4	*
5	* Licensed under the Apache License, Version 2.0 (the "License");
6	* you may not use this file except in compliance with the License.
7	* You may obtain a copy of the License at
8	*
9	* http://www.apache.org/licenses/LICENSE-2.0
10	*
11	* Unless required by applicable law or agreed to in writing, software
12	* distributed under the License is distributed on an "AS IS" BASIS,
13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14	* See the License for the specific language governing permissions and
15	* limitations under the License.
16	*
17	**************************************************************************/
18
19	#include <realm/unicode.hpp>
20
21	#include <algorithm>
22	#include <clocale>
23	#include <vector>
24
25	#ifdef _WIN32
26	#ifndef NOMINMAX
27	#define NOMINMAX
28	#endif
29	#include <windows.h>
30	#else
31	#include <ctype.h>
32	#endif
33
34	namespace realm {
35
36	// clang-format off
37	// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
38	size_t sequence_length(char lead)
39	{	38,308✔
40	// keep 'static' else entire array will be pushed to stack at each call
41	const static unsigned char lengths[256] = {	38,308✔
42	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	38,308✔
43	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	38,308✔
44	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	38,308✔
45	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1	38,308✔
46	};	38,308✔
47
48	return lengths[static_cast<unsigned char>(lead)];	38,308✔
49	}	38,308✔
50	// clang-format on
51
52	// Check if the next UTF-8 sequence in [begin, end) is identical to
53	// the one beginning at begin2. If it is, 'begin' is advanced
54	// accordingly.
55	bool equal_sequence(const char& begin, const char end, const char* begin2)
56	{	18,740✔
57	if (begin[0] != begin2[0])	18,740✔
58	return false;	616✔
59
60	size_t i = 1;	18,124✔
61	if (static_cast<int>(std::char_traits<char>::to_int_type(begin[0])) & 0x80) {	18,124✔
62	// All following bytes matching '10xxxxxx' will be considered
63	// as part of this character.
64	while (begin + i != end) {	72✔
65	if ((static_cast<int>(std::char_traits<char>::to_int_type(begin[i])) & (0x80 + 0x40)) != 0x80)	60✔
66	break;	16✔
67	if (begin[i] != begin2[i])	44✔
68	return false;	×
69	++i;	44✔
70	}	44✔
71	}	28✔
72
73	begin += i;	18,124✔
74	return true;	18,124✔
75	}	18,124✔
76
77	// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78	uint32_t utf8value(const char* character)
79	{	×
80	const unsigned char* c = reinterpret_cast<const unsigned char*>(character);	×
81	size_t len = sequence_length(c[0]);	×
82	uint32_t res = c[0];	×
83
84	if (len == 1)	×
85	return res;	×
86
87	res &= (0x3f >> (len - 1));	×
88
89	for (size_t i = 1; i < len; i++)	×
90	res = ((res << 6) \| (c[i] & 0x3f));	×
91
92	return res;	×
93	}	×
94
95	// Converts UTF-8 source into upper or lower case. This function
96	// preserves the byte length of each UTF-8 character in following way:
97	// If an output character differs in size, it is simply substituded by
98	// the original character. This may of course give wrong search
99	// results in very special cases. Todo.
100	util::Optional<std::string> case_map(StringData source, bool upper)
101	{	154,456✔
102	std::string result;	154,456✔
103	result.resize(source.size());	154,456✔
104
105	#if defined(_WIN32)
106	constexpr int tmp_buffer_size = 32;
107	const char* begin = source.data();
108	const char* end = begin + source.size();
109	auto output = result.begin();
110	while (begin != end) {
111	auto n = end - begin;
112	if (n > tmp_buffer_size) {
113	// Break the input string into chunks - but don't break in the middle of a multibyte character
114	const char* p = begin;
115	const char* buffer_end = begin + tmp_buffer_size;
116	while (p < buffer_end) {
117	size_t len = sequence_length(*p);
118	p += len;
119	if (p > buffer_end) {
120	p -= len;
121	break;
122	}
123	}
124	n = p - begin;
125	}
126
127	wchar_t tmp[tmp_buffer_size];
128
129	int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130	if (n2 == 0)
131	return util::none;
132
133	if (n2 < tmp_buffer_size)
134	tmp[n2] = 0;
135
136	// Note: If tmp[0] == 0, it is because the string contains a
137	// null-chacarcter, which is perfectly fine.
138
139	wchar_t mapped_tmp[tmp_buffer_size];
140	LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141	tmp_buffer_size, nullptr, nullptr, 0);
142
143	// FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144	// to catch invalid UTF-8. Even though the documentation says
145	// unambigously that it is supposed to work, it doesn't. When
146	// the flag is specified, the function fails with error
147	// ERROR_INVALID_FLAGS.
148	DWORD flags = 0;
149	auto m = static_cast<int>(end - begin);
150	int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151	if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152	return util::none;
153
154	if (n3 != n) {
155	realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156	}
157
158	begin += n;
159	output += n;
160	}
161
162	return result;
163	#else
164	size_t sz = source.size();	154,456✔
165	typedef std::char_traits<char> traits;	154,456✔
166	for (size_t i = 0; i < sz; ++i) {	12,547,544✔
167	char c = source[i];	12,393,092✔
168	auto int_val = traits::to_int_type(c);	12,393,092✔
169
170	auto copy_bytes = [&](size_t n) {	12,393,092✔
171	if (i + n > sz) {	128✔
172	return false;	4✔
173	}	4✔
174	for (size_t j = 1; j < n; j++) {	400✔
175	result[i++] = c;	276✔
176	c = source[i];	276✔
177	if ((c & 0xC0) != 0x80) {	276✔
178	return false;	×
179	}	×
180	}	276✔
181	return true;	124✔
182	};	124✔
183
184	if (int_val < 0x80) {	12,393,092✔
185	// Handle ASCII
186	if (upper && (c >= 'a' && c <= 'z')) {	12,392,540✔
187	c -= 0x20;	10,808,664✔
188	}	10,808,664✔
189	else if (!upper && (c >= 'A' && c <= 'Z')) {	1,583,876✔
190	c += 0x20;	540,648✔
191	}	540,648✔
192	}	12,392,540✔
193	else {	552✔
194	if ((int_val & 0xE0) == 0xc0) {	552✔
195	// 2 byte utf-8
196	if (i + 2 > sz) {	424✔
197	return {};	×
198	}	×
199	c = source[i + 1];	424✔
200	if ((c & 0xC0) != 0x80) {	424✔
201	return {};	×
202	}	×
203	auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;	424✔
204	// Handle some Latin-1 supplement characters
205	if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {	424✔
206	u -= 0x20;	180✔
207	}	180✔
208	else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {	244✔
209	u += 0x20;	120✔
210	}	120✔
211
212	result[i++] = static_cast<char>((u >> 6) \| 0xC0);	424✔
213	c = static_cast<char>((u & 0x3f) \| 0x80);	424✔
214	}	424✔
215	else if ((int_val & 0xF0) == 0xE0) {	128✔
216	// 3 byte utf-8
217	if (!copy_bytes(3)) {	96✔
218	return {};	×
219	}	×
220	}	96✔
221	else if ((int_val & 0xF8) == 0xF0) {	32✔
222	// 4 byte utf-8
223	if (!copy_bytes(4)) {	32✔
224	return {};	4✔
225	}	4✔
226	}	32✔
227	else {	×
228	return {};	×
229	}	×
230	}	552✔
231	result[i] = c;	12,393,088✔
232	}	12,393,088✔
233	return result;	154,452✔
234	#endif	154,456✔
235	}	154,456✔
236
237	std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238	{	82,944✔
239	return case_map(source, upper).value_or("");	82,944✔
240	}	82,944✔
241
242	// If needle == haystack, return true. NOTE: This function first
243	// performs a case insensitive byte compare instead of one whole
244	// UTF-8 character at a time. This is very fast, but not enough to
245	// guarantee that the strings are identical, so we need to finish off
246	// with a slower but rigorous comparison. The signature is similar in
247	// spirit to std::equal().
248	bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249	{	73,408✔
250	for (size_t i = 0; i != haystack.size(); ++i) {	124,756✔
251	char c = haystack[i];	66,268✔
252	if (needle_lower[i] != c && needle_upper[i] != c)	66,268✔
253	return false;	14,920✔
254	}	66,268✔
255
256	const char* begin = haystack.data();	58,488✔
257	const char* end = begin + haystack.size();	58,488✔
258	const char* i = begin;	58,488✔
259	while (i != end) {	76,612✔
260	if (!equal_sequence(i, end, needle_lower + (i - begin)) &&	18,124✔
261	!equal_sequence(i, end, needle_upper + (i - begin)))	18,124✔
262	return false;	×
263	}	18,124✔
264	return true;	58,488✔
265	}	58,488✔
266
267
268	// Test if needle is a substring of haystack. The signature is similar
269	// in spirit to std::search().
270	size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271	{	6,212✔
272	// FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
273	size_t i = 0;	6,212✔
274	while (needle_size <= haystack.size() - i) {	17,052✔
275	if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {	12,596✔
276	return i;	1,756✔
277	}	1,756✔
278	++i;	10,840✔
279	}	10,840✔
280	return haystack.size(); // Not found	4,456✔
281	}	6,212✔
282
283	/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284	/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285	/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286	bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287	const std::array<uint8_t, 256>& charmap)
288	{	4,948✔
289	if (needle_size == 0)	4,948✔
290	return haystack.size() != 0;	×
291
292	// Prepare vars to avoid lookups in loop
293	size_t last_char_pos = needle_size - 1;	4,948✔
294	unsigned char lastCharU = needle_upper[last_char_pos];	4,948✔
295	unsigned char lastCharL = needle_lower[last_char_pos];	4,948✔
296
297	// Do Boyer-Moore search
298	size_t p = last_char_pos;	4,948✔
299	while (p < haystack.size()) {	9,560✔
300	unsigned char c = haystack.data()[p]; // Get candidate for last char	5,160✔
301
302	if (c == lastCharU \|\| c == lastCharL) {	5,160✔
303	StringData candidate = haystack.substr(p - needle_size + 1, needle_size);	580✔
304	if (equal_case_fold(candidate, needle_upper, needle_lower))	580✔
305	return true; // text found!	548✔
306	}	580✔
307
308	// If we don't have a match, see how far we can move char_pos
309	if (charmap[c] == 0)	4,612✔
310	p += needle_size; // char was not present in search string	4,464✔
311	else	148✔
312	p += charmap[c];	148✔
313	}	4,612✔
314
315	return false;	4,400✔
316	}	4,948✔
317
318	bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
319	{	8,196✔
320	if (text.is_null() \|\| lower.is_null()) {	8,196✔
321	return (text.is_null() && lower.is_null());	×
322	}	×
323
324	return StringData::matchlike_ins(text, lower, upper);	8,196✔
325	}	8,196✔
326
327	bool string_like_ins(StringData text, StringData pattern) noexcept
328	{	148✔
329	if (text.is_null() \|\| pattern.is_null()) {	148✔
330	return (text.is_null() && pattern.is_null());	20✔
331	}	20✔
332
333	std::string upper = case_map(pattern, true, IgnoreErrors);	128✔
334	std::string lower = case_map(pattern, false, IgnoreErrors);	128✔
335
336	return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());	128✔
337	}	148✔
338
339	} // namespace realm

realm / realm-core / kraen.hansen_75

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous