james.stone_381

Committed 25 Sep 2023 06:35PM UTC coverage: 90.919% (+0.03%) from 90.892%

Build # james.stone_381

Build Type

Pull #6670

Evergreen

Committed by

ironage

Commit Message

optimize: only compare strings once

Pull Request Pull Request #6670: Sorting stage 3

Run Details

97114 of 177952 branches covered (0.0%)

879 of 887 new or added lines in 12 files covered. (99.1%)

105 existing lines in 17 files now uncovered.

236103 of 259684 relevant lines covered (90.92%)

7062315.99 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.64

/src/realm/unicode.cpp

/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#include <realm/unicode.hpp>

#include <algorithm>
#include <clocale>
#include <vector>

#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#else
#include <ctype.h>
#endif

namespace realm {

// clang-format off
// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
size_t sequence_length(char lead)
{
    // keep 'static' else entire array will be pushed to stack at each call
    const static unsigned char lengths[256] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
    };

    return lengths[static_cast<unsigned char>(lead)];
}
// clang-format on

// Check if the next UTF-8 sequence in [begin, end) is identical to
// the one beginning at begin2. If it is, 'begin' is advanced
// accordingly.
bool equal_sequence(const char*& begin, const char* end, const char* begin2)
{
    if (begin[0] != begin2[0])
        return false;

    size_t i = 1;
    if (static_cast<int>(std::char_traits<char>::to_int_type(begin[0])) & 0x80) {
        // All following bytes matching '10xxxxxx' will be considered
        // as part of this character.
        while (begin + i != end) {
            if ((static_cast<int>(std::char_traits<char>::to_int_type(begin[i])) & (0x80 + 0x40)) != 0x80)
                break;
            if (begin[i] != begin2[i])
                return false;
            ++i;
        }
    }

    begin += i;
    return true;
}

// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
uint32_t utf8value(const char* character)
{
    const unsigned char* c = reinterpret_cast<const unsigned char*>(character);
    size_t len = sequence_length(c[0]);
    uint32_t res = c[0];

    if (len == 1)
        return res;

    res &= (0x3f >> (len - 1));

    for (size_t i = 1; i < len; i++)
        res = ((res << 6) | (c[i] & 0x3f));

    return res;
}

// Converts UTF-8 source into upper or lower case. This function
// preserves the byte length of each UTF-8 character in following way:
// If an output character differs in size, it is simply substituded by
// the original character. This may of course give wrong search
// results in very special cases. Todo.
util::Optional<std::string> case_map(StringData source, bool upper)
{
    std::string result;
    result.resize(source.size());

#if defined(_WIN32)
    constexpr int tmp_buffer_size = 32;
    const char* begin = source.data();
    const char* end = begin + source.size();
    auto output = result.begin();
    while (begin != end) {
        auto n = end - begin;
        if (n > tmp_buffer_size) {
            // Break the input string into chunks - but don't break in the middle of a multibyte character
            const char* p = begin;
            const char* buffer_end = begin + tmp_buffer_size;
            while (p < buffer_end) {
                size_t len = sequence_length(*p);
                p += len;
                if (p > buffer_end) {
                    p -= len;
                    break;
                }
            }
            n = p - begin;
        }

        wchar_t tmp[tmp_buffer_size];

        int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
        if (n2 == 0)
            return util::none;

        if (n2 < tmp_buffer_size)
            tmp[n2] = 0;

        // Note: If tmp[0] == 0, it is because the string contains a
        // null-chacarcter, which is perfectly fine.

        wchar_t mapped_tmp[tmp_buffer_size];
        LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
                      tmp_buffer_size, nullptr, nullptr, 0);

        // FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
        // to catch invalid UTF-8. Even though the documentation says
        // unambigously that it is supposed to work, it doesn't. When
        // the flag is specified, the function fails with error
        // ERROR_INVALID_FLAGS.
        DWORD flags = 0;
        auto m = static_cast<int>(end - begin);
        int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
        if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
            return util::none;

        if (n3 != n) {
            realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
        }

        begin += n;
        output += n;
    }

    return result;
#else
    size_t sz = source.size();
    typedef std::char_traits<char> traits;
    for (size_t i = 0; i < sz; ++i) {
        char c = source[i];
        auto int_val = traits::to_int_type(c);

        auto copy_bytes = [&](size_t n) {
            if (i + n > sz) {
                return false;
            }
            for (size_t j = 1; j < n; j++) {
                result[i++] = c;
                c = source[i];
                if ((c & 0xC0) != 0x80) {
                    return false;
                }
            }
            return true;
        };

        if (int_val < 0x80) {
            // Handle ASCII
            if (upper && (c >= 'a' && c <= 'z')) {
                c -= 0x20;
            }
            else if (!upper && (c >= 'A' && c <= 'Z')) {
                c += 0x20;
            }
        }
        else {
            if ((int_val & 0xE0) == 0xc0) {
                // 2 byte utf-8
                if (i + 2 > sz) {
                    return {};
                }
                c = source[i + 1];
                if ((c & 0xC0) != 0x80) {
                    return {};
                }
                auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;
                // Handle some Latin-1 supplement characters
                if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {
                    u -= 0x20;
                }
                else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {
                    u += 0x20;
                }

                result[i++] = static_cast<char>((u >> 6) | 0xC0);
                c = static_cast<char>((u & 0x3f) | 0x80);
            }
            else if ((int_val & 0xF0) == 0xE0) {
                // 3 byte utf-8
                if (!copy_bytes(3)) {
                    return {};
                }
            }
            else if ((int_val & 0xF8) == 0xF0) {
                // 4 byte utf-8
                if (!copy_bytes(4)) {
                    return {};
                }
            }
            else {
                return {};
            }
        }
        result[i] = c;
    }
    return result;
#endif
}

std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
{
    return case_map(source, upper).value_or("");
}

// If needle == haystack, return true. NOTE: This function first
// performs a case insensitive *byte* compare instead of one whole
// UTF-8 character at a time. This is very fast, but not enough to
// guarantee that the strings are identical, so we need to finish off
// with a slower but rigorous comparison. The signature is similar in
// spirit to std::equal().
bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
{
    for (size_t i = 0; i != haystack.size(); ++i) {
        char c = haystack[i];
        if (needle_lower[i] != c && needle_upper[i] != c)
            return false;
    }

    const char* begin = haystack.data();
    const char* end = begin + haystack.size();
    const char* i = begin;
    while (i != end) {
        if (!equal_sequence(i, end, needle_lower + (i - begin)) &&
            !equal_sequence(i, end, needle_upper + (i - begin)))
            return false;
    }
    return true;
}


// Test if needle is a substring of haystack. The signature is similar
// in spirit to std::search().
size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
{
    // FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.
    size_t i = 0;
    while (needle_size <= haystack.size() - i) {
        if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {
            return i;
        }
        ++i;
    }
    return haystack.size(); // Not found
}

/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
                  const std::array<uint8_t, 256>& charmap)
{
    if (needle_size == 0)
        return haystack.size() != 0;

    // Prepare vars to avoid lookups in loop
    size_t last_char_pos = needle_size - 1;
    unsigned char lastCharU = needle_upper[last_char_pos];
    unsigned char lastCharL = needle_lower[last_char_pos];

    // Do Boyer-Moore search
    size_t p = last_char_pos;
    while (p < haystack.size()) {
        unsigned char c = haystack.data()[p]; // Get candidate for last char

        if (c == lastCharU || c == lastCharL) {
            StringData candidate = haystack.substr(p - needle_size + 1, needle_size);
            if (equal_case_fold(candidate, needle_upper, needle_lower))
                return true; // text found!
        }

        // If we don't have a match, see how far we can move char_pos
        if (charmap[c] == 0)
            p += needle_size; // char was not present in search string
        else
            p += charmap[c];
    }

    return false;
}

bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
{
    if (text.is_null() || lower.is_null()) {
        return (text.is_null() && lower.is_null());
    }

    return StringData::matchlike_ins(text, lower, upper);
}

bool string_like_ins(StringData text, StringData pattern) noexcept
{
    if (text.is_null() || pattern.is_null()) {
        return (text.is_null() && pattern.is_null());
    }

    std::string upper = case_map(pattern, true, IgnoreErrors);
    std::string lower = case_map(pattern, false, IgnoreErrors);

    return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());
}

} // namespace realm

1	/*************************************************************************
2	*
3	* Copyright 2016 Realm Inc.
4	*
5	* Licensed under the Apache License, Version 2.0 (the "License");
6	* you may not use this file except in compliance with the License.
7	* You may obtain a copy of the License at
8	*
9	* http://www.apache.org/licenses/LICENSE-2.0
10	*
11	* Unless required by applicable law or agreed to in writing, software
12	* distributed under the License is distributed on an "AS IS" BASIS,
13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14	* See the License for the specific language governing permissions and
15	* limitations under the License.
16	*
17	**************************************************************************/
18
19	#include <realm/unicode.hpp>
20
21	#include <algorithm>
22	#include <clocale>
23	#include <vector>
24
25	#ifdef _WIN32
26	#ifndef NOMINMAX
27	#define NOMINMAX
28	#endif
29	#include <windows.h>
30	#else
31	#include <ctype.h>
32	#endif
33
34	namespace realm {
35
36	// clang-format off
37	// Returns the number of bytes in a UTF-8 sequence whose leading byte is as specified.
38	size_t sequence_length(char lead)
39	{	2,510,778✔
40	// keep 'static' else entire array will be pushed to stack at each call	1,238,475✔
41	const static unsigned char lengths[256] = {	2,510,778✔
42	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	2,510,778✔
43	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	2,510,778✔
44	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	2,510,778✔
45	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1	2,510,778✔
46	};	2,510,778✔
47		1,238,475✔
48	return lengths[static_cast<unsigned char>(lead)];	2,510,778✔
49	}	2,510,778✔
50	// clang-format on
51
52	// Check if the next UTF-8 sequence in [begin, end) is identical to
53	// the one beginning at begin2. If it is, 'begin' is advanced
54	// accordingly.
55	bool equal_sequence(const char& begin, const char end, const char* begin2)
56	{	22,962✔
57	if (begin[0] != begin2[0])	22,962✔
58	return false;	924✔
59		11,019✔
60	size_t i = 1;	22,038✔
61	if (static_cast<int>(std::char_traits<char>::to_int_type(begin[0])) & 0x80) {	22,038✔
62	// All following bytes matching '10xxxxxx' will be considered	21✔
63	// as part of this character.	21✔
64	while (begin + i != end) {	108✔
65	if ((static_cast<int>(std::char_traits<char>::to_int_type(begin[i])) & (0x80 + 0x40)) != 0x80)	90✔
66	break;	24✔
67	if (begin[i] != begin2[i])	66✔
68	return false;	×
69	++i;	66✔
70	}	66✔
71	}	42✔
72		11,019✔
73	begin += i;	22,038✔
74	return true;	22,038✔
75	}	22,038✔
76
77	// Translate from utf8 char to unicode. No check for invalid utf8; may read out of bounds! Caller must check.
78	uint32_t utf8value(const char* character)
UNCOV 79	{	×
UNCOV 80	const unsigned char* c = reinterpret_cast<const unsigned char*>(character);	×
UNCOV 81	size_t len = sequence_length(c[0]);	×
UNCOV 82	uint32_t res = c[0];	×
83
UNCOV 84	if (len == 1)	×
UNCOV 85	return res;	×
86
UNCOV 87	res &= (0x3f >> (len - 1));	×
88
UNCOV 89	for (size_t i = 1; i < len; i++)	×
UNCOV 90	res = ((res << 6) \| (c[i] & 0x3f));	×
91
UNCOV 92	return res;	×
UNCOV 93	}	×
94
95	// Converts UTF-8 source into upper or lower case. This function
96	// preserves the byte length of each UTF-8 character in following way:
97	// If an output character differs in size, it is simply substituded by
98	// the original character. This may of course give wrong search
99	// results in very special cases. Todo.
100	util::Optional<std::string> case_map(StringData source, bool upper)
101	{	218,580✔
102	std::string result;	218,580✔
103	result.resize(source.size());	218,580✔
104		109,290✔
105	#if defined(_WIN32)
106	constexpr int tmp_buffer_size = 32;
107	const char* begin = source.data();
108	const char* end = begin + source.size();
109	auto output = result.begin();
110	while (begin != end) {
111	auto n = end - begin;
112	if (n > tmp_buffer_size) {
113	// Break the input string into chunks - but don't break in the middle of a multibyte character
114	const char* p = begin;
115	const char* buffer_end = begin + tmp_buffer_size;
116	while (p < buffer_end) {
117	size_t len = sequence_length(*p);
118	p += len;
119	if (p > buffer_end) {
120	p -= len;
121	break;
122	}
123	}
124	n = p - begin;
125	}
126
127	wchar_t tmp[tmp_buffer_size];
128
129	int n2 = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, begin, int(n), tmp, tmp_buffer_size);
130	if (n2 == 0)
131	return util::none;
132
133	if (n2 < tmp_buffer_size)
134	tmp[n2] = 0;
135
136	// Note: If tmp[0] == 0, it is because the string contains a
137	// null-chacarcter, which is perfectly fine.
138
139	wchar_t mapped_tmp[tmp_buffer_size];
140	LCMapStringEx(LOCALE_NAME_INVARIANT, upper ? LCMAP_UPPERCASE : LCMAP_LOWERCASE, tmp, n2, mapped_tmp,
141	tmp_buffer_size, nullptr, nullptr, 0);
142
143	// FIXME: The intention is to use flag 'WC_ERR_INVALID_CHARS'
144	// to catch invalid UTF-8. Even though the documentation says
145	// unambigously that it is supposed to work, it doesn't. When
146	// the flag is specified, the function fails with error
147	// ERROR_INVALID_FLAGS.
148	DWORD flags = 0;
149	auto m = static_cast<int>(end - begin);
150	int n3 = WideCharToMultiByte(CP_UTF8, flags, mapped_tmp, n2, &*output, m, 0, 0);
151	if (n3 == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
152	return util::none;
153
154	if (n3 != n) {
155	realm::safe_copy_n(begin, n, output); // Cannot handle different size, copy source
156	}
157
158	begin += n;
159	output += n;
160	}
161
162	return result;
163	#else
164	size_t sz = source.size();	218,580✔
165	typedef std::char_traits<char> traits;	218,580✔
166	for (size_t i = 0; i < sz; ++i) {	18,714,573✔
167	char c = source[i];	18,495,999✔
168	auto int_val = traits::to_int_type(c);	18,495,999✔
169		9,247,998✔
170	auto copy_bytes = [&](size_t n) {	9,248,094✔
171	if (i + n > sz) {	192✔
172	return false;	6✔
173	}	6✔
174	for (size_t j = 1; j < n; j++) {	600✔
175	result[i++] = c;	414✔
176	c = source[i];	414✔
177	if ((c & 0xC0) != 0x80) {	414✔
178	return false;	×
179	}	×
180	}	414✔
181	return true;	186✔
182	};	186✔
183		9,247,998✔
184	if (int_val < 0x80) {	18,495,999✔
185	// Handle ASCII	9,247,584✔
186	if (upper && (c >= 'a' && c <= 'z')) {	18,495,171✔
187	c -= 0x20;	16,181,580✔
188	}	16,181,580✔
189	else if (!upper && (c >= 'A' && c <= 'Z')) {	2,313,591✔
190	c += 0x20;	807,156✔
191	}	807,156✔
192	}	18,495,171✔
193	else {	828✔
194	if ((int_val & 0xE0) == 0xc0) {	828✔
195	// 2 byte utf-8	318✔
196	if (i + 2 > sz) {	636✔
197	return {};	×
198	}	×
199	c = source[i + 1];	636✔
200	if ((c & 0xC0) != 0x80) {	636✔
201	return {};	×
202	}	×
203	auto u = ((int_val << 6) + (traits::to_int_type(c) & 0x3F)) & 0x7FF;	636✔
204	// Handle some Latin-1 supplement characters	318✔
205	if (upper && (u >= 0xE0 && u <= 0xFE && u != 0xF7)) {	636✔
206	u -= 0x20;	270✔
207	}	270✔
208	else if (!upper && (u >= 0xC0 && u <= 0xDE && u != 0xD7)) {	366✔
209	u += 0x20;	180✔
210	}	180✔
211		318✔
212	result[i++] = static_cast<char>((u >> 6) \| 0xC0);	636✔
213	c = static_cast<char>((u & 0x3f) \| 0x80);	636✔
214	}	636✔
215	else if ((int_val & 0xF0) == 0xE0) {	192✔
216	// 3 byte utf-8	72✔
217	if (!copy_bytes(3)) {	144✔
218	return {};	×
219	}	×
220	}	48✔
221	else if ((int_val & 0xF8) == 0xF0) {	48✔
222	// 4 byte utf-8	24✔
223	if (!copy_bytes(4)) {	48✔
224	return {};	6✔
225	}	6✔
UNCOV 226	}	×
UNCOV 227	else {	×
UNCOV 228	return {};	×
UNCOV 229	}	×
230	}	18,495,993✔
231	result[i] = c;	18,495,993✔
232	}	18,495,993✔
233	return result;	218,577✔
234	#endif	218,580✔
235	}	218,580✔
236
237	std::string case_map(StringData source, bool upper, IgnoreErrorsTag)
238	{	111,309✔
239	return case_map(source, upper).value_or("");	111,309✔
240	}	111,309✔
241
242	// If needle == haystack, return true. NOTE: This function first
243	// performs a case insensitive byte compare instead of one whole
244	// UTF-8 character at a time. This is very fast, but not enough to
245	// guarantee that the strings are identical, so we need to finish off
246	// with a slower but rigorous comparison. The signature is similar in
247	// spirit to std::equal().
248	bool equal_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower)
249	{	105,432✔
250	for (size_t i = 0; i != haystack.size(); ++i) {	153,978✔
251	char c = haystack[i];	67,362✔
252	if (needle_lower[i] != c && needle_upper[i] != c)	67,362✔
253	return false;	18,816✔
254	}	67,362✔
255		52,716✔
256	const char* begin = haystack.data();	96,024✔
257	const char* end = begin + haystack.size();	86,616✔
258	const char* i = begin;	86,616✔
259	while (i != end) {	108,654✔
260	if (!equal_sequence(i, end, needle_lower + (i - begin)) &&	22,038✔
261	!equal_sequence(i, end, needle_upper + (i - begin)))	11,481✔
262	return false;	×
263	}	22,038✔
264	return true;	86,616✔
265	}	86,616✔
266
267
268	// Test if needle is a substring of haystack. The signature is similar
269	// in spirit to std::search().
270	size_t search_case_fold(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size)
271	{	7,602✔
272	// FIXME: This solution is very inefficient. Consider deploying the Boyer-Moore algorithm.	3,801✔
273	size_t i = 0;	7,602✔
274	while (needle_size <= haystack.size() - i) {	22,782✔
275	if (equal_case_fold(haystack.substr(i, needle_size), needle_upper, needle_lower)) {	17,334✔
276	return i;	2,154✔
277	}	2,154✔
278	++i;	15,180✔
279	}	15,180✔
280	return haystack.size(); // Not found	6,525✔
281	}	7,602✔
282
283	/// This method takes an array that maps chars (both upper- and lowercase) to distance that can be moved
284	/// (and zero for chars not in needle), allowing the method to apply Boyer-Moore for quick substring search
285	/// The map is calculated in the StringNode<ContainsIns> class (so it can be reused across searches)
286	bool contains_ins(StringData haystack, const char* needle_upper, const char* needle_lower, size_t needle_size,
287	const std::array<uint8_t, 256>& charmap)
288	{	7,422✔
289	if (needle_size == 0)	7,422✔
290	return haystack.size() != 0;	×
291		3,711✔
292	// Prepare vars to avoid lookups in loop	3,711✔
293	size_t last_char_pos = needle_size - 1;	7,422✔
294	unsigned char lastCharU = needle_upper[last_char_pos];	7,422✔
295	unsigned char lastCharL = needle_lower[last_char_pos];	7,422✔
296		3,711✔
297	// Do Boyer-Moore search	3,711✔
298	size_t p = last_char_pos;	7,422✔
299	while (p < haystack.size()) {	14,340✔
300	unsigned char c = haystack.data()[p]; // Get candidate for last char	7,740✔
301		3,870✔
302	if (c == lastCharU \|\| c == lastCharL) {	7,740✔
303	StringData candidate = haystack.substr(p - needle_size + 1, needle_size);	870✔
304	if (equal_case_fold(candidate, needle_upper, needle_lower))	870✔
305	return true; // text found!	822✔
306	}	6,918✔
307		3,459✔
308	// If we don't have a match, see how far we can move char_pos	3,459✔
309	if (charmap[c] == 0)	6,918✔
310	p += needle_size; // char was not present in search string	6,696✔
311	else	222✔
312	p += charmap[c];	222✔
313	}	6,918✔
314		3,711✔
315	return false;	7,011✔
316	}	7,422✔
317
318	bool string_like_ins(StringData text, StringData upper, StringData lower) noexcept
319	{	10,578✔
320	if (text.is_null() \|\| lower.is_null()) {	10,578✔
321	return (text.is_null() && lower.is_null());	×
322	}	×
323		5,289✔
324	return StringData::matchlike_ins(text, lower, upper);	10,578✔
325	}	10,578✔
326
327	bool string_like_ins(StringData text, StringData pattern) noexcept
328	{	222✔
329	if (text.is_null() \|\| pattern.is_null()) {	222✔
330	return (text.is_null() && pattern.is_null());	30✔
331	}	30✔
332		96✔
333	std::string upper = case_map(pattern, true, IgnoreErrors);	192✔
334	std::string lower = case_map(pattern, false, IgnoreErrors);	192✔
335		96✔
336	return StringData::matchlike_ins(text, lower.c_str(), upper.c_str());	192✔
337	}	192✔
338
339	} // namespace realm

realm / realm-core / james.stone_381

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous