| // Copyright 2011 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "base/i18n/string_search.h" |
| |
| #include <stddef.h> |
| |
| #include <string> |
| #include <vector> |
| |
| #include "base/i18n/rtl.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "testing/gtest/include/gtest/gtest.h" |
| #include "third_party/icu/source/i18n/unicode/usearch.h" |
| |
| namespace base { |
| namespace i18n { |
| |
| #if !defined(UCONFIG_NO_COLLATION) |
| |
| #define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(find_this, in_this, &index, \ |
| &length)); \ |
| EXPECT_EQ(ex_start, index); \ |
| EXPECT_EQ(ex_len, length); \ |
| index = 0; \ |
| length = 0; \ |
| EXPECT_TRUE( \ |
| StringSearch(find_this, in_this, &index, &length, false, true)); \ |
| EXPECT_EQ(ex_start, index); \ |
| EXPECT_EQ(ex_len, length); \ |
| } |
| |
| #define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_TRUE( \ |
| StringSearch(find_this, in_this, &index, &length, true, true)); \ |
| EXPECT_EQ(ex_start, index); \ |
| EXPECT_EQ(ex_len, length); \ |
| } |
| |
| #define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start, \ |
| ex_len) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_TRUE( \ |
| StringSearch(find_this, in_this, &index, &length, false, false)); \ |
| EXPECT_EQ(ex_start, index); \ |
| EXPECT_EQ(ex_len, length); \ |
| } |
| |
| #define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, ex_len) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_TRUE( \ |
| StringSearch(find_this, in_this, &index, &length, true, false)); \ |
| EXPECT_EQ(ex_start, index); \ |
| EXPECT_EQ(ex_len, length); \ |
| } |
| |
| #define EXPECT_MISS_IGNORE_CASE(find_this, in_this) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(find_this, in_this, \ |
| &index, &length)); \ |
| index = 0; \ |
| length = 0; \ |
| EXPECT_FALSE( \ |
| StringSearch(find_this, in_this, &index, &length, false, true)); \ |
| } |
| |
| #define EXPECT_MISS_SENSITIVE(find_this, in_this) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_FALSE( \ |
| StringSearch(find_this, in_this, &index, &length, true, true)); \ |
| } |
| |
| #define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_FALSE( \ |
| StringSearch(find_this, in_this, &index, &length, false, false)); \ |
| } |
| |
| #define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this) \ |
| { \ |
| size_t index = 0; \ |
| size_t length = 0; \ |
| EXPECT_FALSE( \ |
| StringSearch(find_this, in_this, &index, &length, true, false)); \ |
| } |
| |
| // Note on setting default locale for testing: The current default locale on |
| // the Mac trybot is en_US_POSIX, with which primary-level collation strength |
| // string search is case-sensitive, when normally it should be |
| // case-insensitive. In other locales (including en_US which English speakers |
| // in the U.S. use), this search would be case-insensitive as expected. |
| |
| TEST(StringSearchTest, ASCII) { |
| std::string default_locale(uloc_getDefault()); |
| bool locale_is_posix = (default_locale == "en_US_POSIX"); |
| if (locale_is_posix) |
| SetICUDefaultLocale("en_US"); |
| |
| EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U); |
| |
| EXPECT_MISS_IGNORE_CASE(u"h e l l o", u"h e l l o"); |
| |
| EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U); |
| |
| EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string()); |
| |
| EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string", 0U, |
| 0U); |
| |
| EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY", 0U, |
| 18U); |
| |
| EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U); |
| |
| EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string()); |
| |
| EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string", 0U, |
| 0U); |
| |
| EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY"); |
| |
| if (locale_is_posix) |
| SetICUDefaultLocale(default_locale.data()); |
| } |
| |
| TEST(StringSearchTest, UnicodeLocaleIndependent) { |
| // Base characters |
| const std::u16string e_base = u"e"; |
| const std::u16string E_base = u"E"; |
| const std::u16string a_base = u"a"; |
| |
| // Composed characters |
| const std::u16string e_with_acute_accent = u"\u00e9"; |
| const std::u16string E_with_acute_accent = u"\u00c9"; |
| const std::u16string e_with_grave_accent = u"\u00e8"; |
| const std::u16string E_with_grave_accent = u"\u00c8"; |
| const std::u16string a_with_acute_accent = u"\u00e1"; |
| |
| // Decomposed characters |
| const std::u16string e_with_acute_combining_mark = u"e\u0301"; |
| const std::u16string E_with_acute_combining_mark = u"E\u0301"; |
| const std::u16string e_with_grave_combining_mark = u"e\u0300"; |
| const std::u16string E_with_grave_combining_mark = u"E\u0300"; |
| const std::u16string a_with_acute_combining_mark = u"a\u0301"; |
| |
| std::string default_locale(uloc_getDefault()); |
| bool locale_is_posix = (default_locale == "en_US_POSIX"); |
| if (locale_is_posix) |
| SetICUDefaultLocale("en_US"); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_accent, 0U, |
| e_with_acute_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_base, 0U, e_base.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_combining_mark, 0U, |
| e_with_acute_combining_mark.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_base, 0U, |
| e_base.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_acute_accent, 0U, |
| e_with_acute_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_with_acute_combining_mark, 0U, |
| e_with_acute_combining_mark.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, |
| e_with_grave_combining_mark, 0U, |
| e_with_grave_combining_mark.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_grave_combining_mark, |
| e_with_acute_combining_mark, 0U, |
| e_with_acute_combining_mark.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_grave_accent, 0U, |
| e_with_grave_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(e_with_grave_accent, e_with_acute_combining_mark, 0U, |
| e_with_acute_combining_mark.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(E_with_acute_accent, e_with_acute_accent, 0U, |
| e_with_acute_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(E_with_grave_accent, e_with_acute_accent, 0U, |
| e_with_acute_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(E_with_acute_combining_mark, e_with_grave_accent, 0U, |
| e_with_grave_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(E_with_grave_combining_mark, e_with_acute_accent, 0U, |
| e_with_acute_accent.size()); |
| |
| EXPECT_MATCH_IGNORE_CASE(E_base, e_with_grave_accent, 0U, |
| e_with_grave_accent.size()); |
| |
| EXPECT_MISS_IGNORE_CASE(a_with_acute_accent, e_with_acute_accent); |
| |
| EXPECT_MISS_IGNORE_CASE(a_with_acute_combining_mark, |
| e_with_acute_combining_mark); |
| |
| EXPECT_MISS_SENSITIVE(e_base, e_with_acute_accent); |
| |
| EXPECT_MISS_SENSITIVE(e_with_acute_accent, e_base); |
| |
| EXPECT_MISS_SENSITIVE(e_base, e_with_acute_combining_mark); |
| |
| EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_base); |
| |
| EXPECT_MATCH_SENSITIVE(e_with_acute_combining_mark, e_with_acute_accent, 0U, |
| 1U); |
| |
| EXPECT_MATCH_SENSITIVE(e_with_acute_accent, e_with_acute_combining_mark, 0U, |
| 2U); |
| |
| EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, |
| e_with_grave_combining_mark); |
| |
| EXPECT_MISS_SENSITIVE(e_with_grave_combining_mark, |
| e_with_acute_combining_mark); |
| |
| EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_with_grave_accent); |
| |
| EXPECT_MISS_SENSITIVE(e_with_grave_accent, e_with_acute_combining_mark); |
| |
| EXPECT_MISS_SENSITIVE(E_with_acute_accent, e_with_acute_accent); |
| |
| EXPECT_MISS_SENSITIVE(E_with_grave_accent, e_with_acute_accent); |
| |
| EXPECT_MISS_SENSITIVE(E_with_acute_combining_mark, e_with_grave_accent); |
| |
| EXPECT_MISS_SENSITIVE(E_with_grave_combining_mark, e_with_acute_accent); |
| |
| EXPECT_MISS_SENSITIVE(E_base, e_with_grave_accent); |
| |
| EXPECT_MISS_SENSITIVE(a_with_acute_accent, e_with_acute_accent); |
| |
| EXPECT_MISS_SENSITIVE(a_with_acute_combining_mark, |
| e_with_acute_combining_mark); |
| |
| EXPECT_MATCH_SENSITIVE(a_with_acute_combining_mark, |
| a_with_acute_combining_mark, 0U, 2U); |
| |
| if (locale_is_posix) |
| SetICUDefaultLocale(default_locale.data()); |
| } |
| |
| TEST(StringSearchTest, UnicodeLocaleDependent) { |
| // Base characters |
| const std::u16string a_base = u"a"; |
| |
| // Composed characters |
| const std::u16string a_with_ring = u"\u00e5"; |
| |
| EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr, |
| nullptr)); |
| EXPECT_TRUE(StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true)); |
| |
| const char* default_locale = uloc_getDefault(); |
| SetICUDefaultLocale("da"); |
| |
| EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr, |
| nullptr)); |
| EXPECT_FALSE( |
| StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true)); |
| |
| SetICUDefaultLocale(default_locale); |
| } |
| |
| TEST(StringSearchTest, SearchBackwards) { |
| std::string default_locale(uloc_getDefault()); |
| bool locale_is_posix = (default_locale == "en_US_POSIX"); |
| if (locale_is_posix) |
| SetICUDefaultLocale("en_US"); |
| |
| EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U); |
| EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U); |
| EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB"); |
| |
| if (locale_is_posix) |
| SetICUDefaultLocale(default_locale.data()); |
| } |
| |
| TEST(StringSearchTest, FixedPatternMultipleSearch) { |
| std::string default_locale(uloc_getDefault()); |
| bool locale_is_posix = (default_locale == "en_US_POSIX"); |
| if (locale_is_posix) |
| SetICUDefaultLocale("en_US"); |
| |
| size_t index = 0; |
| size_t length = 0; |
| |
| // Search "foo" over multiple texts. |
| FixedPatternStringSearch query1(u"foo", true); |
| EXPECT_TRUE(query1.Search(u"12foo34", &index, &length, true)); |
| EXPECT_EQ(2U, index); |
| EXPECT_EQ(3U, length); |
| EXPECT_FALSE(query1.Search(u"bye", &index, &length, true)); |
| EXPECT_FALSE(query1.Search(u"FOO", &index, &length, true)); |
| EXPECT_TRUE(query1.Search(u"foobarfoo", &index, &length, true)); |
| EXPECT_EQ(0U, index); |
| EXPECT_EQ(3U, length); |
| EXPECT_TRUE(query1.Search(u"foobarfoo", &index, &length, false)); |
| EXPECT_EQ(6U, index); |
| EXPECT_EQ(3U, length); |
| |
| // Search "hello" over multiple texts. |
| FixedPatternStringSearchIgnoringCaseAndAccents query2(u"hello"); |
| EXPECT_TRUE(query2.Search(u"12hello34", &index, &length)); |
| EXPECT_EQ(2U, index); |
| EXPECT_EQ(5U, length); |
| EXPECT_FALSE(query2.Search(u"bye", &index, &length)); |
| EXPECT_TRUE(query2.Search(u"hELLo", &index, &length)); |
| EXPECT_EQ(0U, index); |
| EXPECT_EQ(5U, length); |
| |
| if (locale_is_posix) |
| SetICUDefaultLocale(default_locale.data()); |
| } |
| |
| TEST(StringSearchTest, RepeatingStringSearch) { |
| struct MatchResult { |
| int match_index; |
| int match_length; |
| }; |
| |
| std::string default_locale(uloc_getDefault()); |
| bool locale_is_posix = (default_locale == "en_US_POSIX"); |
| if (locale_is_posix) |
| SetICUDefaultLocale("en_US"); |
| |
| const char16_t kPattern[] = u"fox"; |
| const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox"; |
| |
| // Case sensitive. |
| { |
| const MatchResult kExpectation[] = {{16, 3}}; |
| |
| RepeatingStringSearch searcher(kPattern, kTarget, /*case_sensitive=*/true); |
| std::vector<MatchResult> results; |
| int match_index; |
| int match_length; |
| while (searcher.NextMatchResult(match_index, match_length)) { |
| results.push_back( |
| {.match_index = match_index, .match_length = match_length}); |
| } |
| |
| ASSERT_EQ(std::size(kExpectation), results.size()); |
| for (size_t i = 0; i < results.size(); ++i) { |
| EXPECT_EQ(results[i].match_index, kExpectation[i].match_index); |
| EXPECT_EQ(results[i].match_length, kExpectation[i].match_length); |
| } |
| } |
| |
| // Case insensitive. |
| { |
| const MatchResult kExpectation[] = {{16, 3}, {41, 3}}; |
| |
| RepeatingStringSearch searcher(kPattern, kTarget, /*case_sensitive=*/false); |
| std::vector<MatchResult> results; |
| int match_index; |
| int match_length; |
| while (searcher.NextMatchResult(match_index, match_length)) { |
| results.push_back( |
| {.match_index = match_index, .match_length = match_length}); |
| } |
| |
| ASSERT_EQ(std::size(kExpectation), results.size()); |
| for (size_t i = 0; i < results.size(); ++i) { |
| EXPECT_EQ(results[i].match_index, kExpectation[i].match_index); |
| EXPECT_EQ(results[i].match_length, kExpectation[i].match_length); |
| } |
| } |
| |
| if (locale_is_posix) |
| SetICUDefaultLocale(default_locale.data()); |
| } |
| |
| #endif |
| |
| } // namespace i18n |
| } // namespace base |