// blob: 0c5a1acf219eefb8a602f86a1aadd501a2813637 [file] [log] [blame]
// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/i18n/string_search.h"
#include <stddef.h>
#include <string>
#include <vector>
#include "base/i18n/rtl.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/icu/source/i18n/unicode/usearch.h"
namespace base {
namespace i18n {
#if !defined(UCONFIG_NO_COLLATION)
// Expects a forward, case-insensitive search for |find_this| inside |in_this|
// to succeed, with the match starting at |ex_start| and having length
// |ex_len|.
//
// The expectation is checked through two entry points:
//   1. StringSearchIgnoringCaseAndAccents(), and
//   2. StringSearch(..., false, true) -- judging by the sibling macro names the
//      two flags are "case sensitive" and "search forward".
//
// |find_this|, |in_this|, |ex_start| and |ex_len| are each evaluated twice, so
// pass side-effect-free expressions. The body is wrapped in do/while(0) so the
// macro behaves as a single statement (safe in an unbraced if/else).
#define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \
                                                   &index, &length)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
    index = 0; \
    length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, false, true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., true, true) to succeed and to
// report a match starting at |ex_start| with length |ex_len|. Judging by the
// sibling macro names the two flags select a case-sensitive, forward search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., false, false) to succeed and
// to report a match starting at |ex_start| with length |ex_len|. Judging by
// the sibling macro names the two flags select a case-insensitive, backward
// search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start, \
                                           ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., true, false) to succeed and to
// report a match starting at |ex_start| with length |ex_len|. Judging by the
// sibling macro names the two flags select a case-sensitive, backward search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, \
                                         ex_len) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_TRUE( \
        StringSearch(find_this, in_this, &index, &length, true, false)); \
    EXPECT_EQ(ex_start, index); \
    EXPECT_EQ(ex_len, length); \
  } while (0)
// Expects a forward, case-insensitive search for |find_this| inside |in_this|
// to find nothing. Checked through both
// StringSearchIgnoringCaseAndAccents() and StringSearch(..., false, true).
//
// |find_this| and |in_this| are each evaluated twice, so pass side-effect-free
// expressions. Wrapped in do/while(0) so the macro behaves as a single
// statement (safe in an unbraced if/else).
#define EXPECT_MISS_IGNORE_CASE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \
                                                    &index, &length)); \
    index = 0; \
    length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, false, true)); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., true, true) to find nothing.
// Judging by the sibling macro names the two flags select a case-sensitive,
// forward search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MISS_SENSITIVE(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., false, false) to find nothing.
// Judging by the sibling macro names the two flags select a case-insensitive,
// backward search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
  } while (0)
// Expects StringSearch(find_this, in_this, ..., true, false) to find nothing.
// Judging by the sibling macro names the two flags select a case-sensitive,
// backward search.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else).
#define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this) \
  do { \
    size_t index = 0; \
    size_t length = 0; \
    EXPECT_FALSE( \
        StringSearch(find_this, in_this, &index, &length, true, false)); \
  } while (0)
// Note on setting default locale for testing: The current default locale on
// the Mac trybot is en_US_POSIX, with which primary-level collation strength
// string search is case-sensitive, when normally it should be
// case-insensitive. In other locales (including en_US which English speakers
// in the U.S. use), this search would be case-insensitive as expected.
// Forward searches over plain ASCII input, in both case-insensitive and
// case-sensitive modes, including an empty needle and an empty haystack.
TEST(StringSearchTest, ASCII) {
  // Copy the current default locale name so it can be restored at the end of
  // the test; en_US_POSIX is swapped for en_US while the test runs.
  const std::string default_locale(uloc_getDefault());
  const bool locale_is_posix = (default_locale == "en_US_POSIX");
  if (locale_is_posix)
    SetICUDefaultLocale("en_US");

  EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U);

  // Needle and haystack differ in the number of spaces after 'h', so no match
  // is expected. (Two byte-identical strings would always match at index 0,
  // which would make a "miss" expectation fail.)
  EXPECT_MISS_IGNORE_CASE(u"h    e l l o", u"h   e l l o");

  EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string());

  // An empty needle is expected to match at index 0 with length 0.
  EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string", 0U,
                           0U);

  EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY", 0U,
                           18U);

  EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string());

  EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string", 0U,
                         0U);

  EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY");

  if (locale_is_posix)
    SetICUDefaultLocale(default_locale.data());
}
// Checks how accented Latin letters compare in precomposed and decomposed
// (base letter + combining mark) forms: the insensitive search is expected to
// ignore both accent and case, while the sensitive search only treats the
// composed and decomposed spellings of the same letter as equal.
TEST(StringSearchTest, UnicodeLocaleIndependent) {
  // Base characters
  const std::u16string e_base(u"e");
  const std::u16string E_base(u"E");
  const std::u16string a_base(u"a");

  // Composed characters
  const std::u16string e_with_acute_accent(u"\u00e9");
  const std::u16string E_with_acute_accent(u"\u00c9");
  const std::u16string e_with_grave_accent(u"\u00e8");
  const std::u16string E_with_grave_accent(u"\u00c8");
  const std::u16string a_with_acute_accent(u"\u00e1");

  // Decomposed characters
  const std::u16string e_with_acute_combining_mark(u"e\u0301");
  const std::u16string E_with_acute_combining_mark(u"E\u0301");
  const std::u16string e_with_grave_combining_mark(u"e\u0300");
  const std::u16string E_with_grave_combining_mark(u"E\u0300");
  const std::u16string a_with_acute_combining_mark(u"a\u0301");

  // Run under en_US when the process default is en_US_POSIX; the original
  // locale name is copied so it can be put back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool swap_posix_locale = original_locale == "en_US_POSIX";
  if (swap_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Case- and accent-insensitive: base letter vs. accented forms. ---
  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_base, 0U, e_base.size());
  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_base, 0U,
                           e_base.size());

  // --- Insensitive: composed vs. decomposed, acute vs. grave. ---
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark,
                           e_with_grave_combining_mark, 0U,
                           e_with_grave_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_grave_combining_mark,
                           e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());
  EXPECT_MATCH_IGNORE_CASE(e_with_grave_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());

  // --- Insensitive: upper-case needles against lower-case text. ---
  EXPECT_MATCH_IGNORE_CASE(E_with_acute_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_grave_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_with_grave_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());
  EXPECT_MATCH_IGNORE_CASE(E_base, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());

  // --- Insensitive: a different base letter never matches. ---
  EXPECT_MISS_IGNORE_CASE(a_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_IGNORE_CASE(a_with_acute_combining_mark,
                          e_with_acute_combining_mark);

  // --- Case-sensitive: an accent makes the letter differ from its base. ---
  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(e_with_acute_accent, e_base);
  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_base);

  // --- Sensitive: composed and decomposed spellings are still equal. ---
  EXPECT_MATCH_SENSITIVE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                         1U);
  EXPECT_MATCH_SENSITIVE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                         2U);

  // --- Sensitive: different accents differ. ---
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark,
                        e_with_grave_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_grave_combining_mark,
                        e_with_acute_combining_mark);
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_with_grave_accent);
  EXPECT_MISS_SENSITIVE(e_with_grave_accent, e_with_acute_combining_mark);

  // --- Sensitive: different case differs. ---
  EXPECT_MISS_SENSITIVE(E_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_with_grave_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_with_acute_combining_mark, e_with_grave_accent);
  EXPECT_MISS_SENSITIVE(E_with_grave_combining_mark, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(E_base, e_with_grave_accent);

  // --- Sensitive: different base letter differs; identical text matches. ---
  EXPECT_MISS_SENSITIVE(a_with_acute_accent, e_with_acute_accent);
  EXPECT_MISS_SENSITIVE(a_with_acute_combining_mark,
                        e_with_acute_combining_mark);
  EXPECT_MATCH_SENSITIVE(a_with_acute_combining_mark,
                         a_with_acute_combining_mark, 0U, 2U);

  if (swap_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
// Checks that the outcome of searching "a" in "a with ring above" (U+00E5)
// depends on the ICU default locale: a match is expected under the locale the
// test starts with, and a miss once the default locale is switched to "da".
TEST(StringSearchTest, UnicodeLocaleDependent) {
  // Base character.
  const std::u16string needle_a(u"a");

  // Composed character.
  const std::u16string text_a_ring(u"\u00e5");

  // Under the starting default locale both entry points report a match.
  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(needle_a, text_a_ring, nullptr,
                                                 nullptr));
  EXPECT_TRUE(
      StringSearch(needle_a, text_a_ring, nullptr, nullptr, false, true));

  // Remember the current default before switching to "da".
  const char* const saved_locale = uloc_getDefault();
  SetICUDefaultLocale("da");

  // With "da" as the default locale both entry points report a miss.
  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(needle_a, text_a_ring,
                                                  nullptr, nullptr));
  EXPECT_FALSE(
      StringSearch(needle_a, text_a_ring, nullptr, nullptr, false, true));

  SetICUDefaultLocale(saved_locale);
}
// Backward searches: with two occurrences of the needle in the text, the one
// at index 2 (the later one) is expected to be reported.
TEST(StringSearchTest, SearchBackwards) {
  // Run under en_US when the process default is en_US_POSIX; the original
  // locale name is copied so it can be put back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool swap_posix_locale = original_locale == "en_US_POSIX";
  if (swap_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Case-insensitive: "ab" matches the trailing "AB".
  EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U);

  // Case-sensitive: matches same-case text only.
  EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U);
  EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB");

  if (swap_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
// Builds a searcher once for a fixed pattern and reuses it across several
// texts, for both FixedPatternStringSearch and
// FixedPatternStringSearchIgnoringCaseAndAccents.
TEST(StringSearchTest, FixedPatternMultipleSearch) {
  // Run under en_US when the process default is en_US_POSIX; the original
  // locale name is copied so it can be put back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool swap_posix_locale = original_locale == "en_US_POSIX";
  if (swap_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Out-parameters shared by every Search() call below; they are only
  // inspected after a search that is expected to succeed.
  size_t match_index = 0;
  size_t match_length = 0;

  // Search "foo" over multiple texts.
  FixedPatternStringSearch foo_query(u"foo", true);

  EXPECT_TRUE(foo_query.Search(u"12foo34", &match_index, &match_length, true));
  EXPECT_EQ(2U, match_index);
  EXPECT_EQ(3U, match_length);

  EXPECT_FALSE(foo_query.Search(u"bye", &match_index, &match_length, true));
  EXPECT_FALSE(foo_query.Search(u"FOO", &match_index, &match_length, true));

  // Same text searched with the last argument true, then false: the first and
  // then the last occurrence is expected.
  EXPECT_TRUE(
      foo_query.Search(u"foobarfoo", &match_index, &match_length, true));
  EXPECT_EQ(0U, match_index);
  EXPECT_EQ(3U, match_length);

  EXPECT_TRUE(
      foo_query.Search(u"foobarfoo", &match_index, &match_length, false));
  EXPECT_EQ(6U, match_index);
  EXPECT_EQ(3U, match_length);

  // Search "hello" over multiple texts.
  FixedPatternStringSearchIgnoringCaseAndAccents hello_query(u"hello");

  EXPECT_TRUE(hello_query.Search(u"12hello34", &match_index, &match_length));
  EXPECT_EQ(2U, match_index);
  EXPECT_EQ(5U, match_length);

  EXPECT_FALSE(hello_query.Search(u"bye", &match_index, &match_length));

  EXPECT_TRUE(hello_query.Search(u"hELLo", &match_index, &match_length));
  EXPECT_EQ(0U, match_index);
  EXPECT_EQ(5U, match_length);

  if (swap_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
// Iterates over every match RepeatingStringSearch reports for "fox" in a
// sentence containing both "fox" and "Fox", once case-sensitively and once
// case-insensitively, and compares the (index, length) pairs with the
// expected lists.
TEST(StringSearchTest, RepeatingStringSearch) {
  struct MatchResult {
    int match_index;
    int match_length;
  };

  // Swaps an en_US_POSIX default locale for en_US for the lifetime of the
  // object. The restore lives in the destructor so that it also runs when a
  // fatal ASSERT_* below returns from the test body early; a trailing
  // "restore" statement would be skipped on that path and the overridden
  // locale would leak into later tests.
  class ScopedPosixLocaleOverride {
   public:
    ScopedPosixLocaleOverride()
        : original_locale_(uloc_getDefault()),
          overridden_(original_locale_ == "en_US_POSIX") {
      if (overridden_)
        SetICUDefaultLocale("en_US");
    }
    ~ScopedPosixLocaleOverride() {
      if (overridden_)
        SetICUDefaultLocale(original_locale_.data());
    }
    ScopedPosixLocaleOverride(const ScopedPosixLocaleOverride&) = delete;
    ScopedPosixLocaleOverride& operator=(const ScopedPosixLocaleOverride&) =
        delete;

   private:
    // Copy of the locale name that was the default at construction time.
    const std::string original_locale_;
    // True when the default locale was replaced and must be restored.
    const bool overridden_;
  };
  const ScopedPosixLocaleOverride locale_override;

  const char16_t kPattern[] = u"fox";
  const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox";

  // Drains a fresh searcher and returns every match in the order reported.
  const auto collect_matches = [&](bool case_sensitive) {
    RepeatingStringSearch searcher(kPattern, kTarget, case_sensitive);
    std::vector<MatchResult> results;
    int match_index = 0;
    int match_length = 0;
    while (searcher.NextMatchResult(match_index, match_length)) {
      results.push_back(
          {.match_index = match_index, .match_length = match_length});
    }
    return results;
  };

  // Case sensitive.
  {
    const MatchResult kExpectation[] = {{16, 3}};
    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/true);
    // Fatal on mismatch: the loop below indexes kExpectation by result index.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }

  // Case insensitive.
  {
    const MatchResult kExpectation[] = {{16, 3}, {41, 3}};
    const std::vector<MatchResult> results =
        collect_matches(/*case_sensitive=*/false);
    // Fatal on mismatch: the loop below indexes kExpectation by result index.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }
}
#endif
} // namespace i18n
} // namespace base