| // Copyright 2020 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "base/strings/escape.h" |
| |
| #include <ostream> |
| |
| #include "base/check_op.h" |
| #include "base/feature_list.h" |
| #include "base/features.h" |
| #include "base/strings/string_piece.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/utf_string_conversion_utils.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/third_party/icu/icu_utf.h" |
| |
| namespace base { |
| |
| namespace { |
| |
| const char kHexString[] = "0123456789ABCDEF"; |
| inline char IntToHex(int i) { |
| DCHECK_GE(i, 0) << i << " not a hex value"; |
| DCHECK_LE(i, 15) << i << " not a hex value"; |
| return kHexString[i]; |
| } |
| |
| // A fast bit-vector map for ascii characters. |
| // |
| // Internally stores 256 bits in an array of 8 ints. |
| // Does quick bit-flicking to lookup needed characters. |
| struct Charmap { |
| bool Contains(unsigned char c) const { |
| return ((map[c >> 5] & (1 << (c & 31))) != 0); |
| } |
| |
| uint32_t map[8]; |
| }; |
| |
| // Given text to escape and a Charmap defining which values to escape, |
| // return an escaped string. If use_plus is true, spaces are converted |
| // to +, otherwise, if spaces are in the charmap, they are converted to |
| // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if |
| // '%' is in the charmap, it is converted to %25. |
| std::string Escape(StringPiece text, |
| const Charmap& charmap, |
| bool use_plus, |
| bool keep_escaped = false) { |
| std::string escaped; |
| escaped.reserve(text.length() * 3); |
| for (size_t i = 0; i < text.length(); ++i) { |
| unsigned char c = static_cast<unsigned char>(text[i]); |
| if (use_plus && ' ' == c) { |
| escaped.push_back('+'); |
| } else if (keep_escaped && '%' == c && i + 2 < text.length() && |
| IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) { |
| escaped.push_back('%'); |
| } else if (charmap.Contains(c)) { |
| escaped.push_back('%'); |
| escaped.push_back(IntToHex(c >> 4)); |
| escaped.push_back(IntToHex(c & 0xf)); |
| } else { |
| escaped.push_back(static_cast<char>(c)); |
| } |
| } |
| return escaped; |
| } |
| |
| // Convert a character |c| to a form that will not be mistaken as HTML. |
| template <class str> |
| void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { |
| static constexpr struct { |
| char key; |
| StringPiece replacement; |
| } kCharsToEscape[] = { |
| {'<', "<"}, {'>', ">"}, {'&', "&"}, |
| {'"', """}, {'\'', "'"}, |
| }; |
| for (const auto& char_to_escape : kCharsToEscape) { |
| if (c == char_to_escape.key) { |
| output->append(std::begin(char_to_escape.replacement), |
| std::end(char_to_escape.replacement)); |
| return; |
| } |
| } |
| output->push_back(c); |
| } |
| |
| // Convert |input| string to a form that will not be interpreted as HTML. |
| template <typename T, typename CharT = typename T::value_type> |
| std::basic_string<CharT> EscapeForHTMLImpl(T input) { |
| std::basic_string<CharT> result; |
| result.reserve(input.size()); // Optimize for no escaping. |
| |
| for (auto c : input) { |
| AppendEscapedCharForHTMLImpl(c, &result); |
| } |
| |
| return result; |
| } |
| |
| // Everything except alphanumerics and -._~ |
| // See RFC 3986 for the list of unreserved characters. |
| static const Charmap kUnreservedCharmap = { |
| {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL, 0xffffffffL}}; |
| |
| // Everything except alphanumerics and !'()*-._~ |
| // See RFC 2396 for the list of reserved characters. |
| static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L, |
| 0xb8000001L, 0xffffffffL, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL}}; |
| |
| // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|} |
| static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L, |
| 0xb8000001L, 0xffffffffL, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL}}; |
| |
| #if BUILDFLAG(IS_APPLE) |
| // non-printable, non-7bit, and (including space) "#%<>[\]^`{|} |
| static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L, |
| 0xb8000001L, 0xffffffffL, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL}}; |
| #endif // BUILDFLAG(IS_APPLE) |
| |
| // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|} |
| static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L, |
| 0xb8000001L, 0xffffffffL, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL}}; |
| |
| // non-7bit, as well as %. |
| static const Charmap kNonASCIICharmapAndPercent = { |
| {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL, 0xffffffffL}}; |
| |
| // non-7bit |
| static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L, |
| 0x00000000L, 0xffffffffL, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL}}; |
| |
| // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and |
| // !'()*-._~#[] |
| static const Charmap kExternalHandlerCharmap = { |
| {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL, |
| 0xffffffffL, 0xffffffffL, 0xffffffffL}}; |
| |
| // Contains nonzero when the corresponding character is unescapable for normal |
| // URLs. These characters are the ones that may change the parsing of a URL, so |
| // we don't want to unescape them sometimes. In many case we won't want to |
| // unescape spaces, but that is controlled by parameters to Unescape*. |
| // |
| // The basic rule is that we can't unescape anything that would changing parsing |
| // like # or ?. We also can't unescape &, =, or + since that could be part of a |
| // query and that could change the server's parsing of the query. Nor can we |
| // unescape \ since src/url/ will convert it to a /. |
| // |
| // Lastly, we can't unescape anything that doesn't have a canonical |
| // representation in a URL. This means that unescaping will change the URL, and |
| // you could get different behavior if you copy and paste the URL, or press |
| // enter in the URL bar. The list of characters that fall into this category |
| // are the ones labeled PASS (allow either escaped or unescaped) in the big |
| // lookup table at the top of url/url_canon_path.cc. Also, characters |
| // that have CHAR_QUERY set in url/url_canon_internal.cc but are not |
| // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are |
| // not unescaped, to avoid turning a valid url according to spec into an |
| // invalid one. |
| // clang-format off |
| const char kUrlUnescape[128] = { |
| // Null, control chars... |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| // ' ' ! " # $ % & ' ( ) * + , - . / |
| 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, |
| // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, |
| // @ A B C D E F G H I J K L M N O |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| // P Q R S T U V W X Y Z [ \ ] ^ _ |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, |
| // ` a b c d e f g h i j k l m n o |
| 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| // p q r s t u v w x y z { | } ~ <NBSP> |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, |
| }; |
| // clang-format on |
| |
| // Attempts to unescape the sequence at |index| within |escaped_text|. If |
| // successful, sets |value| to the unescaped value. Returns whether |
| // unescaping succeeded. |
| bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text, |
| size_t index, |
| unsigned char* value) { |
| if ((index + 2) >= escaped_text.size()) |
| return false; |
| if (escaped_text[index] != '%') |
| return false; |
| char most_sig_digit(escaped_text[index + 1]); |
| char least_sig_digit(escaped_text[index + 2]); |
| if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { |
| *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 + |
| HexDigitToInt(least_sig_digit)); |
| return true; |
| } |
| return false; |
| } |
| |
| // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at |
| // the specified index. On success, returns true, sets |code_point_out| to be |
| // the character's code point and |unescaped_out| to be the unescaped UTF-8 |
| // string. |unescaped_out| will always be 1/3rd the length of the substring of |
| // |escaped_text| that corresponds to the unescaped character. |
| bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text, |
| size_t index, |
| base_icu::UChar32* code_point_out, |
| std::string* unescaped_out) { |
| DCHECK(unescaped_out->empty()); |
| |
| unsigned char bytes[CBU8_MAX_LENGTH]; |
| if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0])) |
| return false; |
| |
| size_t num_bytes = 1; |
| |
| // If this is a lead byte, need to collect trail bytes as well. |
| if (CBU8_IS_LEAD(bytes[0])) { |
| // Look for the last trail byte of the UTF-8 character. Give up once |
| // reach max character length number of bytes, or hit an unescaped |
| // character. No need to check length of escaped_text, as |
| // UnescapeUnsignedByteAtIndex checks lengths. |
| while (num_bytes < std::size(bytes) && |
| UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3, |
| &bytes[num_bytes]) && |
| CBU8_IS_TRAIL(bytes[num_bytes])) { |
| ++num_bytes; |
| } |
| } |
| |
| size_t char_index = 0; |
| // Check if the unicode "character" that was just unescaped is valid. |
| if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes, |
| &char_index, code_point_out)) { |
| return false; |
| } |
| |
| // It's possible that a prefix of |bytes| forms a valid UTF-8 character, |
| // and the rest are not valid UTF-8, so need to update |num_bytes| based |
| // on the result of ReadUnicodeCharacter(). |
| num_bytes = char_index + 1; |
| *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes); |
| return true; |
| } |
| |
| // This method takes a Unicode code point and returns true if it should be |
| // unescaped, based on |rules|. |
| bool ShouldUnescapeCodePoint(UnescapeRule::Type rules, |
| base_icu::UChar32 code_point) { |
| // If this is an ASCII character, use the lookup table. |
| if (code_point >= 0 && code_point < 0x80) { |
| return kUrlUnescape[static_cast<size_t>(code_point)] || |
| // Allow some additional unescaping when flags are set. |
| (code_point == ' ' && (rules & UnescapeRule::SPACES)) || |
| // Allow any of the prohibited but non-control characters when doing |
| // "special" chars. |
| ((code_point == '/' || code_point == '\\') && |
| (rules & UnescapeRule::PATH_SEPARATORS)) || |
| (code_point > ' ' && code_point != '/' && code_point != '\\' && |
| (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS)); |
| } |
| |
| // Compare the code point against a list of characters that can be used |
| // to spoof other URLs. |
| // |
| // Can't use icu to make this cleaner, because Cronet cannot depend on |
| // icu, and currently uses this file. |
| // TODO(https://crbug.com/829873): Try to make this use icu, both to |
| // protect against regressions as the Unicode standard is updated and to |
| // reduce the number of long lists of characters. |
| return !( |
| // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi |
| // control characters are not allowed to appear unescaped in URLs. |
| code_point == 0x200E || // LEFT-TO-RIGHT MARK (%E2%80%8E) |
| code_point == 0x200F || // RIGHT-TO-LEFT MARK (%E2%80%8F) |
| code_point == 0x202A || // LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) |
| code_point == 0x202B || // RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) |
| code_point == 0x202C || // POP DIRECTIONAL FORMATTING (%E2%80%AC) |
| code_point == 0x202D || // LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) |
| code_point == 0x202E || // RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) |
| |
| // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has |
| // since added some new BiDi control characters that are not safe to |
| // unescape. http://www.unicode.org/reports/tr9 |
| code_point == 0x061C || // ARABIC LETTER MARK (%D8%9C) |
| code_point == 0x2066 || // LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
| code_point == 0x2067 || // RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
| code_point == 0x2068 || // FIRST STRONG ISOLATE (%E2%81%A8) |
| code_point == 0x2069 || // POP DIRECTIONAL ISOLATE (%E2%81%A9) |
| |
| // The following spoofable characters are also banned in unescaped URLs, |
| // because they could be used to imitate parts of a web browser's UI. |
| code_point == 0x1F50F || // LOCK WITH INK PEN (%F0%9F%94%8F) |
| code_point == 0x1F510 || // CLOSED LOCK WITH KEY (%F0%9F%94%90) |
| code_point == 0x1F512 || // LOCK (%F0%9F%94%92) |
| code_point == 0x1F513 || // OPEN LOCK (%F0%9F%94%93) |
| |
| // Spaces are also banned, as they can be used to scroll text out of view. |
| code_point == 0x0085 || // NEXT LINE (%C2%85) |
| code_point == 0x00A0 || // NO-BREAK SPACE (%C2%A0) |
| code_point == 0x1680 || // OGHAM SPACE MARK (%E1%9A%80) |
| code_point == 0x2000 || // EN QUAD (%E2%80%80) |
| code_point == 0x2001 || // EM QUAD (%E2%80%81) |
| code_point == 0x2002 || // EN SPACE (%E2%80%82) |
| code_point == 0x2003 || // EM SPACE (%E2%80%83) |
| code_point == 0x2004 || // THREE-PER-EM SPACE (%E2%80%84) |
| code_point == 0x2005 || // FOUR-PER-EM SPACE (%E2%80%85) |
| code_point == 0x2006 || // SIX-PER-EM SPACE (%E2%80%86) |
| code_point == 0x2007 || // FIGURE SPACE (%E2%80%87) |
| code_point == 0x2008 || // PUNCTUATION SPACE (%E2%80%88) |
| code_point == 0x2009 || // THIN SPACE (%E2%80%89) |
| code_point == 0x200A || // HAIR SPACE (%E2%80%8A) |
| code_point == 0x2028 || // LINE SEPARATOR (%E2%80%A8) |
| code_point == 0x2029 || // PARAGRAPH SEPARATOR (%E2%80%A9) |
| code_point == 0x202F || // NARROW NO-BREAK SPACE (%E2%80%AF) |
| code_point == 0x205F || // MEDIUM MATHEMATICAL SPACE (%E2%81%9F) |
| code_point == 0x3000 || // IDEOGRAPHIC SPACE (%E3%80%80) |
| // U+2800 is rendered as a space, but is not considered whitespace (see |
| // crbug.com/1068531). |
| code_point == 0x2800 || // BRAILLE PATTERN BLANK (%E2%A0%80) |
| |
| // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format |
| // characters ([:Cf:]) are also banned (see crbug.com/824715). |
| code_point == 0x00AD || // SOFT HYPHEN (%C2%AD) |
| code_point == 0x034F || // COMBINING GRAPHEME JOINER (%CD%8F) |
| // Arabic number formatting |
| (code_point >= 0x0600 && code_point <= 0x0605) || |
| // U+061C is already banned as a BiDi control character. |
| code_point == 0x06DD || // ARABIC END OF AYAH (%DB%9D) |
| code_point == 0x070F || // SYRIAC ABBREVIATION MARK (%DC%8F) |
| code_point == 0x08E2 || // ARABIC DISPUTED END OF AYAH (%E0%A3%A2) |
| code_point == 0x115F || // HANGUL CHOSEONG FILLER (%E1%85%9F) |
| code_point == 0x1160 || // HANGUL JUNGSEONG FILLER (%E1%85%A0) |
| code_point == 0x17B4 || // KHMER VOWEL INHERENT AQ (%E1%9E%B4) |
| code_point == 0x17B5 || // KHMER VOWEL INHERENT AA (%E1%9E%B5) |
| code_point == 0x180B || // MONGOLIAN FREE VARIATION SELECTOR ONE |
| // (%E1%A0%8B) |
| code_point == 0x180C || // MONGOLIAN FREE VARIATION SELECTOR TWO |
| // (%E1%A0%8C) |
| code_point == 0x180D || // MONGOLIAN FREE VARIATION SELECTOR THREE |
| // (%E1%A0%8D) |
| code_point == 0x180E || // MONGOLIAN VOWEL SEPARATOR (%E1%A0%8E) |
| code_point == 0x200B || // ZERO WIDTH SPACE (%E2%80%8B) |
| code_point == 0x200C || // ZERO WIDTH SPACE NON-JOINER (%E2%80%8C) |
| code_point == 0x200D || // ZERO WIDTH JOINER (%E2%80%8D) |
| // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as |
| // BiDi control characters. |
| code_point == 0x2060 || // WORD JOINER (%E2%81%A0) |
| code_point == 0x2061 || // FUNCTION APPLICATION (%E2%81%A1) |
| code_point == 0x2062 || // INVISIBLE TIMES (%E2%81%A2) |
| code_point == 0x2063 || // INVISIBLE SEPARATOR (%E2%81%A3) |
| code_point == 0x2064 || // INVISIBLE PLUS (%E2%81%A4) |
| code_point == 0x2065 || // null (%E2%81%A5) |
| // 0x2066--0x2069 are already banned as a BiDi control characters. |
| // General Punctuation - Deprecated (U+206A--206F) |
| (code_point >= 0x206A && code_point <= 0x206F) || |
| code_point == 0x3164 || // HANGUL FILLER (%E3%85%A4) |
| (code_point >= 0xFFF0 && code_point <= 0xFFF8) || // null |
| // Variation selectors (%EF%B8%80 -- %EF%B8%8F) |
| (code_point >= 0xFE00 && code_point <= 0xFE0F) || |
| code_point == 0xFEFF || // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF) |
| code_point == 0xFFA0 || // HALFWIDTH HANGUL FILLER (%EF%BE%A0) |
| code_point == 0xFFF9 || // INTERLINEAR ANNOTATION ANCHOR (%EF%BF%B9) |
| code_point == 0xFFFA || // INTERLINEAR ANNOTATION SEPARATOR (%EF%BF%BA) |
| code_point == 0xFFFB || // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB) |
| code_point == 0x110BD || // KAITHI NUMBER SIGN (%F0%91%82%BD) |
| code_point == 0x110CD || // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D) |
| // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8) |
| (code_point >= 0x13430 && code_point <= 0x13438) || |
| // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3) |
| (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) || |
| // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA) |
| (code_point >= 0x1D173 && code_point <= 0x1D17A) || |
| // Tags, Variation Selectors, nulls |
| (code_point >= 0xE0000 && code_point <= 0xE0FFF)); |
| } |
| |
| // Unescapes |escaped_text| according to |rules|, returning the resulting |
| // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects |
| // the alterations done to the string that are not one-character-to-one- |
| // character. The resulting |adjustments| will always be sorted by increasing |
| // offset. |
| std::string UnescapeURLWithAdjustmentsImpl( |
| StringPiece escaped_text, |
| UnescapeRule::Type rules, |
| OffsetAdjuster::Adjustments* adjustments) { |
| if (adjustments) |
| adjustments->clear(); |
| // Do not unescape anything, return the |escaped_text| text. |
| if (rules == UnescapeRule::NONE) |
| return std::string(escaped_text); |
| |
| // The output of the unescaping is always smaller than the input, so we can |
| // reserve the input size to make sure we have enough buffer and don't have |
| // to allocate in the loop below. |
| std::string result; |
| result.reserve(escaped_text.length()); |
| |
| // Locations of adjusted text. |
| for (size_t i = 0, max = escaped_text.size(); i < max;) { |
| // Try to unescape the character. |
| base_icu::UChar32 code_point; |
| std::string unescaped; |
| if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point, |
| &unescaped)) { |
| // Check if the next character can be unescaped, but not as a valid UTF-8 |
| // character. In that case, just unescaped and write the non-sense |
| // character. |
| // |
| // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8 |
| // sequences. |
| unsigned char non_utf8_byte; |
| if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) { |
| result.push_back(static_cast<char>(non_utf8_byte)); |
| if (adjustments) |
| adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1)); |
| i += 3; |
| continue; |
| } |
| |
| // Character is not escaped, so append as is, unless it's a '+' and |
| // REPLACE_PLUS_WITH_SPACE is being applied. |
| if (escaped_text[i] == '+' && |
| (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) { |
| result.push_back(' '); |
| } else { |
| result.push_back(escaped_text[i]); |
| } |
| ++i; |
| continue; |
| } |
| |
| DCHECK(!unescaped.empty()); |
| |
| if (!ShouldUnescapeCodePoint(rules, code_point)) { |
| // If it's a valid UTF-8 character, but not safe to unescape, copy all |
| // bytes directly. |
| result.append(escaped_text.begin() + i, |
| escaped_text.begin() + i + 3 * unescaped.length()); |
| i += unescaped.length() * 3; |
| continue; |
| } |
| |
| // If the code point is allowed, and append the entire unescaped character. |
| result.append(unescaped); |
| if (adjustments) { |
| for (size_t j = 0; j < unescaped.length(); ++j) { |
| adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1)); |
| } |
| } |
| i += 3 * unescaped.length(); |
| } |
| |
| return result; |
| } |
| |
| } // namespace |
| |
| std::string EscapeAllExceptUnreserved(StringPiece text) { |
| return Escape(text, kUnreservedCharmap, false); |
| } |
| |
| std::string EscapeQueryParamValue(StringPiece text, bool use_plus) { |
| return Escape(text, kQueryCharmap, use_plus); |
| } |
| |
| std::string EscapePath(StringPiece path) { |
| return Escape(path, kPathCharmap, false); |
| } |
| |
| #if BUILDFLAG(IS_APPLE) |
| std::string EscapeNSURLPrecursor(StringPiece precursor) { |
| return Escape(precursor, kNSURLCharmap, false, true); |
| } |
| #endif // BUILDFLAG(IS_APPLE) |
| |
| std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) { |
| return Escape(path, kUrlEscape, use_plus); |
| } |
| |
| std::string EscapeNonASCIIAndPercent(StringPiece input) { |
| return Escape(input, kNonASCIICharmapAndPercent, false); |
| } |
| |
| std::string EscapeNonASCII(StringPiece input) { |
| return Escape(input, kNonASCIICharmap, false); |
| } |
| |
| std::string EscapeExternalHandlerValue(StringPiece text) { |
| return Escape(text, kExternalHandlerCharmap, false, true); |
| } |
| |
| void AppendEscapedCharForHTML(char c, std::string* output) { |
| AppendEscapedCharForHTMLImpl(c, output); |
| } |
| |
| std::string EscapeForHTML(StringPiece input) { |
| return EscapeForHTMLImpl(input); |
| } |
| |
| std::u16string EscapeForHTML(StringPiece16 input) { |
| return EscapeForHTMLImpl(input); |
| } |
| |
| std::string UnescapeURLComponent(StringPiece escaped_text, |
| UnescapeRule::Type rules) { |
| return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr); |
| } |
| |
| std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments( |
| StringPiece text, |
| UnescapeRule::Type rules, |
| OffsetAdjuster::Adjustments* adjustments) { |
| std::u16string result; |
| OffsetAdjuster::Adjustments unescape_adjustments; |
| std::string unescaped_url( |
| UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments)); |
| if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(), |
| &result, adjustments)) { |
| // Character set looks like it's valid. |
| if (adjustments) { |
| OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments, |
| adjustments); |
| } |
| return result; |
| } |
| // Character set is not valid. Return the escaped version. |
| return UTF8ToUTF16WithAdjustments(text, adjustments); |
| } |
| |
| std::string UnescapeBinaryURLComponent(StringPiece escaped_text, |
| UnescapeRule::Type rules) { |
| // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported. |
| DCHECK(rules != UnescapeRule::NONE); |
| DCHECK(!(rules & |
| ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE))); |
| |
| // It is not possible to read the feature state when this function is invoked |
| // before FeatureList initialization. In that case, fallback to the feature's |
| // default state. |
| // |
| // TODO(crbug.com/1321924): Cleanup this feature. |
| const bool optimize_data_urls_feature_is_enabled = |
| base::FeatureList::GetInstance() |
| ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls) |
| : features::kOptimizeDataUrls.default_state == |
| base::FEATURE_ENABLED_BY_DEFAULT; |
| |
| // If there are no '%' characters in the string, there will be nothing to |
| // unescape, so we can take the fast path. |
| if (optimize_data_urls_feature_is_enabled && |
| escaped_text.find('%') == StringPiece::npos) { |
| std::string unescaped_text(escaped_text); |
| if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) |
| std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' '); |
| return unescaped_text; |
| } |
| |
| std::string unescaped_text; |
| |
| // The output of the unescaping is always smaller than the input, so we can |
| // reserve the input size to make sure we have enough buffer and don't have |
| // to allocate in the loop below. |
| // Increase capacity before size, as just resizing can grow capacity |
| // needlessly beyond our requested size. |
| unescaped_text.reserve(escaped_text.size()); |
| unescaped_text.resize(escaped_text.size()); |
| |
| size_t output_index = 0; |
| |
| for (size_t i = 0, max = escaped_text.size(); i < max;) { |
| unsigned char byte; |
| // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe |
| // to call. |
| if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) { |
| unescaped_text[output_index++] = static_cast<char>(byte); |
| i += 3; |
| continue; |
| } |
| |
| if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && |
| escaped_text[i] == '+') { |
| unescaped_text[output_index++] = ' '; |
| ++i; |
| continue; |
| } |
| |
| unescaped_text[output_index++] = escaped_text[i++]; |
| } |
| |
| DCHECK_LE(output_index, unescaped_text.size()); |
| unescaped_text.resize(output_index); |
| return unescaped_text; |
| } |
| |
| bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text, |
| bool fail_on_path_separators, |
| std::string* unescaped_text) { |
| unescaped_text->clear(); |
| |
| std::set<unsigned char> illegal_encoded_bytes; |
| for (unsigned char c = '\x00'; c < '\x20'; ++c) { |
| illegal_encoded_bytes.insert(c); |
| } |
| if (fail_on_path_separators) { |
| illegal_encoded_bytes.insert('/'); |
| illegal_encoded_bytes.insert('\\'); |
| } |
| if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes)) |
| return false; |
| |
| *unescaped_text = UnescapeBinaryURLComponent(escaped_text); |
| return true; |
| } |
| |
| bool ContainsEncodedBytes(StringPiece escaped_text, |
| const std::set<unsigned char>& bytes) { |
| for (size_t i = 0, max = escaped_text.size(); i < max;) { |
| unsigned char byte; |
| // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe |
| // to call. |
| if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) { |
| if (bytes.find(byte) != bytes.end()) |
| return true; |
| |
| i += 3; |
| continue; |
| } |
| |
| ++i; |
| } |
| |
| return false; |
| } |
| |
| std::u16string UnescapeForHTML(StringPiece16 input) { |
| static const struct { |
| const char* ampersand_code; |
| const char16_t replacement; |
| } kEscapeToChars[] = { |
| {"<", '<'}, {">", '>'}, {"&", '&'}, |
| {""", '"'}, {"'", '\''}, |
| }; |
| constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars); |
| |
| if (input.find(u"&") == std::string::npos) |
| return std::u16string(input); |
| |
| std::u16string ampersand_chars[kEscapeToCharsCount]; |
| std::u16string text(input); |
| for (std::u16string::iterator iter = text.begin(); iter != text.end(); |
| ++iter) { |
| if (*iter == '&') { |
| // Potential ampersand encode char. |
| size_t index = static_cast<size_t>(iter - text.begin()); |
| for (size_t i = 0; i < std::size(kEscapeToChars); i++) { |
| if (ampersand_chars[i].empty()) { |
| ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code); |
| } |
| if (text.find(ampersand_chars[i], index) == index) { |
| text.replace( |
| iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()), |
| 1, kEscapeToChars[i].replacement); |
| break; |
| } |
| } |
| } |
| } |
| return text; |
| } |
| |
| } // namespace base |