Andrew Top | 0d1858f | 2019-05-15 22:01:47 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "base/logging.h" |
| 6 | #include "url/url_canon.h" |
| 7 | #include "url/url_canon_internal.h" |
| 8 | |
| 9 | namespace url { |
| 10 | |
| 11 | namespace { |
| 12 | |
| 13 | // For reference, here's what IE supports: |
| 14 | // Key: 0 (disallowed: failure if present in the input) |
| 15 | // + (allowed either escaped or unescaped, and unmodified) |
| 16 | // U (allowed escaped or unescaped but always unescaped if present in |
| 17 | // escaped form) |
| 18 | // E (allowed escaped or unescaped but always escaped if present in |
| 19 | // unescaped form) |
| 20 | // % (only allowed escaped in the input, will be unmodified). |
| 21 | // I left blank alpha numeric characters. |
| 22 | // |
| 23 | // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |
| 24 | // ----------------------------------------------- |
| 25 | // 0 0 E E E E E E E E E E E E E E E |
| 26 | // 1 E E E E E E E E E E E E E E E E |
| 27 | // 2 E + E E + E + + + + + + + U U 0 |
| 28 | // 3 % % E + E 0 <-- Those are : ; < = > ? |
| 29 | // 4 % |
| 30 | // 5 U 0 U U U <-- Those are [ \ ] ^ _ |
| 31 | // 6 E <-- That's ` |
| 32 | // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) |
| 33 | // |
| 34 | // NOTE: I didn't actually test all the control characters. Some may be |
| 35 | // disallowed in the input, but they are all accepted escaped except for 0. |
| 36 | // I also didn't test if characters affecting HTML parsing are allowed |
| 37 | // unescaped, e.g. (") or (#), which would indicate the beginning of the path. |
| 38 | // Surprisingly, space is accepted in the input and always escaped. |
| 39 | |
// Lookup table giving, for each 7-bit input character, the canonical form we
// emit for it in a host name. An entry of 0 means the character is not
// allowed; an entry of kEsc means the character is allowed but must be
// written percent-escaped. Any other entry is the character to output
// (upper-case letters map to their lower-case equivalents). We are a little
// more restrictive than IE, but less restrictive than Firefox.
//
// The % character itself is disallowed. It is of course accepted as the start
// of an escape sequence, but this means "%25" is rejected. IE allows it, but
// allowing it would put us in a funny state: given an invalid escape sequence
// such as "%zz", we add "%25zz" to the output and fail. If percents were
// allowed, running the canonicalizer on that output would then succeed, so
// validity would depend on how many times the canonicalizer is run. We prefer
// to always report the same validity, so we reject it.
constexpr unsigned char kEsc = 0xff;
constexpr unsigned char kHostCharLookup[0x80] = {
    // 00-0f: all are invalid
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 10-1f: all are invalid
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // ' '   !     "     #     $     %     &     '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    // (     )     *     +     ,     -     .     /
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0    1    2    3    4    5    6    7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 8    9    :    ;    <     =     >     ?
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // @     A    B    C    D    E    F    G
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // H    I    J    K    L    M    N    O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // P    Q    R    S    T    U    V    W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // X    Y    Z    [    \    ]    ^    _
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // `     a    b    c    d    e    f    g
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // h    i    j    k    l    m    n    o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // p    q    r    s    t    u    v    w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // x    y    z    {     |     }     ~    (7f)
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0,
};
| 69 | |
// Maximum length of a fully-qualified domain name per RFC1034.
constexpr int kMaxHostLength = 253;

// Upper bound on the length of a host handed to IDN processing. It is
// generously padded (five times the RFC1034 FQDN limit) because UTS#46
// normalization can shrink a long string until it fits within 253
// characters. This can still be too short for pathological inputs, since
// UTS#46 processing may remove an arbitrary number of characters (e.g.
// U+00AD SOFT HYPHEN), but it should be sufficient for all
// normally-encountered, non-abusive hostname strings.
constexpr int kMaxHostBufferLength = 5 * kMaxHostLength;

// Size parameter for the temporary host buffers used during canonicalization.
constexpr int kTempHostBufferLen = 1024;
| 82 | typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; |
| 83 | typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW; |
| 84 | |
| 85 | // Scans a host name and fills in the output flags according to what we find. |
| 86 | // |has_non_ascii| will be true if there are any non-7-bit characters, and |
| 87 | // |has_escaped| will be true if there is a percent sign. |
| 88 | template<typename CHAR, typename UCHAR> |
| 89 | void ScanHostname(const CHAR* spec, |
| 90 | const Component& host, |
| 91 | bool* has_non_ascii, |
| 92 | bool* has_escaped) { |
| 93 | int end = host.end(); |
| 94 | *has_non_ascii = false; |
| 95 | *has_escaped = false; |
| 96 | for (int i = host.begin; i < end; i++) { |
| 97 | if (static_cast<UCHAR>(spec[i]) >= 0x80) |
| 98 | *has_non_ascii = true; |
| 99 | else if (spec[i] == '%') |
| 100 | *has_escaped = true; |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | // Canonicalizes a host name that is entirely 8-bit characters (even though |
| 105 | // the type holding them may be 16 bits. Escaped characters will be unescaped. |
| 106 | // Non-7-bit characters (for example, UTF-8) will be passed unchanged. |
| 107 | // |
| 108 | // The |*has_non_ascii| flag will be true if there are non-7-bit characters in |
| 109 | // the output. |
| 110 | // |
| 111 | // This function is used in two situations: |
| 112 | // |
| 113 | // * When the caller knows there is no non-ASCII or percent escaped |
| 114 | // characters. This is what DoHost does. The result will be a completely |
| 115 | // canonicalized host since we know nothing weird can happen (escaped |
| 116 | // characters could be unescaped to non-7-bit, so they have to be treated |
| 117 | // with suspicion at this point). It does not use the |has_non_ascii| flag. |
| 118 | // |
| 119 | // * When the caller has an 8-bit string that may need unescaping. |
| 120 | // DoComplexHost calls us this situation to do unescaping and validation. |
| 121 | // After this, it may do other IDN operations depending on the value of the |
| 122 | // |*has_non_ascii| flag. |
| 123 | // |
| 124 | // The return value indicates if the output is a potentially valid host name. |
| 125 | template<typename INCHAR, typename OUTCHAR> |
| 126 | bool DoSimpleHost(const INCHAR* host, |
| 127 | int host_len, |
| 128 | CanonOutputT<OUTCHAR>* output, |
| 129 | bool* has_non_ascii) { |
| 130 | *has_non_ascii = false; |
| 131 | |
| 132 | bool success = true; |
| 133 | for (int i = 0; i < host_len; ++i) { |
| 134 | unsigned int source = host[i]; |
| 135 | if (source == '%') { |
| 136 | // Unescape first, if possible. |
| 137 | // Source will be used only if decode operation was successful. |
| 138 | if (!DecodeEscaped(host, &i, host_len, |
| 139 | reinterpret_cast<unsigned char*>(&source))) { |
| 140 | // Invalid escaped character. There is nothing that can make this |
| 141 | // host valid. We append an escaped percent so the URL looks reasonable |
| 142 | // and mark as failed. |
| 143 | AppendEscapedChar('%', output); |
| 144 | success = false; |
| 145 | continue; |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | if (source < 0x80) { |
| 150 | // We have ASCII input, we can use our lookup table. |
| 151 | unsigned char replacement = kHostCharLookup[source]; |
| 152 | if (!replacement) { |
| 153 | // Invalid character, add it as percent-escaped and mark as failed. |
| 154 | AppendEscapedChar(source, output); |
| 155 | success = false; |
| 156 | } else if (replacement == kEsc) { |
| 157 | // This character is valid but should be escaped. |
| 158 | AppendEscapedChar(source, output); |
| 159 | } else { |
| 160 | // Common case, the given character is valid in a hostname, the lookup |
| 161 | // table tells us the canonical representation of that character (lower |
| 162 | // cased). |
| 163 | output->push_back(replacement); |
| 164 | } |
| 165 | } else { |
| 166 | // It's a non-ascii char. Just push it to the output. |
| 167 | // In case where we have char16 input, and char output it's safe to |
| 168 | // cast char16->char only if input string was converted to ASCII. |
| 169 | output->push_back(static_cast<OUTCHAR>(source)); |
| 170 | *has_non_ascii = true; |
| 171 | } |
| 172 | } |
| 173 | return success; |
| 174 | } |
| 175 | |
| 176 | // Canonicalizes a host that requires IDN conversion. Returns true on success |
| 177 | bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) { |
| 178 | int original_output_len = output->length(); // So we can rewind below. |
| 179 | |
| 180 | // We need to escape URL before doing IDN conversion, since punicode strings |
| 181 | // cannot be escaped after they are created. |
| 182 | RawCanonOutputW<kTempHostBufferLen> url_escaped_host; |
| 183 | bool has_non_ascii; |
| 184 | DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); |
| 185 | if (url_escaped_host.length() > kMaxHostBufferLength) { |
| 186 | AppendInvalidNarrowString(src, 0, src_len, output); |
| 187 | return false; |
| 188 | } |
| 189 | |
| 190 | StackBufferW wide_output; |
| 191 | if (!IDNToASCII(url_escaped_host.data(), |
| 192 | url_escaped_host.length(), |
| 193 | &wide_output)) { |
| 194 | // Some error, give up. This will write some reasonable looking |
| 195 | // representation of the string to the output. |
| 196 | AppendInvalidNarrowString(src, 0, src_len, output); |
| 197 | return false; |
| 198 | } |
| 199 | |
| 200 | // Now we check the ASCII output like a normal host. It will also handle |
| 201 | // unescaping. Although we unescaped everything before this function call, if |
| 202 | // somebody does %00 as fullwidth, ICU will convert this to ASCII. |
| 203 | bool success = DoSimpleHost(wide_output.data(), |
| 204 | wide_output.length(), |
| 205 | output, &has_non_ascii); |
| 206 | if (has_non_ascii) { |
| 207 | // ICU generated something that DoSimpleHost didn't think looked like |
| 208 | // ASCII. This is quite rare, but ICU might convert some characters to |
| 209 | // percent signs which might generate new escape sequences which might in |
| 210 | // turn be invalid. An example is U+FE6A "small percent" which ICU will |
| 211 | // name prep into an ASCII percent and then we can interpret the following |
| 212 | // characters as escaped characters. |
| 213 | // |
| 214 | // If DoSimpleHost didn't think the output was ASCII, just escape the |
| 215 | // thing we gave ICU and give up. DoSimpleHost will have handled a further |
| 216 | // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates |
| 217 | // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't |
| 218 | // do more (like handle escaped non-ASCII sequences). Handling the escaped |
| 219 | // ASCII isn't strictly necessary, but DoSimpleHost handles this case |
| 220 | // anyway so we handle it/ |
| 221 | output->set_length(original_output_len); |
| 222 | AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(), |
| 223 | output); |
| 224 | return false; |
| 225 | } |
| 226 | return success; |
| 227 | } |
| 228 | |
| 229 | // 8-bit convert host to its ASCII version: this converts the UTF-8 input to |
| 230 | // UTF-16. The has_escaped flag should be set if the input string requires |
| 231 | // unescaping. |
| 232 | bool DoComplexHost(const char* host, int host_len, |
| 233 | bool has_non_ascii, bool has_escaped, CanonOutput* output) { |
| 234 | // Save the current position in the output. We may write stuff and rewind it |
| 235 | // below, so we need to know where to rewind to. |
| 236 | int begin_length = output->length(); |
| 237 | |
| 238 | // Points to the UTF-8 data we want to convert. This will either be the |
| 239 | // input or the unescaped version written to |*output| if necessary. |
| 240 | const char* utf8_source; |
| 241 | int utf8_source_len; |
| 242 | if (has_escaped) { |
| 243 | // Unescape before converting to UTF-16 for IDN. We write this into the |
| 244 | // output because it most likely does not require IDNization, and we can |
| 245 | // save another huge stack buffer. It will be replaced below if it requires |
| 246 | // IDN. This will also update our non-ASCII flag so we know whether the |
| 247 | // unescaped input requires IDN. |
| 248 | if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { |
| 249 | // Error with some escape sequence. We'll call the current output |
| 250 | // complete. DoSimpleHost will have written some "reasonable" output. |
| 251 | return false; |
| 252 | } |
| 253 | |
| 254 | // Unescaping may have left us with ASCII input, in which case the |
| 255 | // unescaped version we wrote to output is complete. |
| 256 | if (!has_non_ascii) { |
| 257 | return true; |
| 258 | } |
| 259 | |
| 260 | // Save the pointer into the data was just converted (it may be appended to |
| 261 | // other data in the output buffer). |
| 262 | utf8_source = &output->data()[begin_length]; |
| 263 | utf8_source_len = output->length() - begin_length; |
| 264 | } else { |
| 265 | // We don't need to unescape, use input for IDNization later. (We know the |
| 266 | // input has non-ASCII, or the simple version would have been called |
| 267 | // instead of us.) |
| 268 | utf8_source = host; |
| 269 | utf8_source_len = host_len; |
| 270 | } |
| 271 | |
| 272 | // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion. |
| 273 | // Above, we may have used the output to write the unescaped values to, so |
| 274 | // we have to rewind it to where we started after we convert it to UTF-16. |
| 275 | StackBufferW utf16; |
| 276 | if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) { |
| 277 | // In this error case, the input may or may not be the output. |
| 278 | StackBuffer utf8; |
| 279 | for (int i = 0; i < utf8_source_len; i++) |
| 280 | utf8.push_back(utf8_source[i]); |
| 281 | output->set_length(begin_length); |
| 282 | AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output); |
| 283 | return false; |
| 284 | } |
| 285 | output->set_length(begin_length); |
| 286 | |
| 287 | // This will call DoSimpleHost which will do normal ASCII canonicalization |
| 288 | // and also check for IP addresses in the outpt. |
| 289 | return DoIDNHost(utf16.data(), utf16.length(), output); |
| 290 | } |
| 291 | |
| 292 | // UTF-16 convert host to its ASCII version. The set up is already ready for |
| 293 | // the backend, so we just pass through. The has_escaped flag should be set if |
| 294 | // the input string requires unescaping. |
| 295 | bool DoComplexHost(const base::char16* host, int host_len, |
| 296 | bool has_non_ascii, bool has_escaped, CanonOutput* output) { |
| 297 | if (has_escaped) { |
| 298 | // Yikes, we have escaped characters with wide input. The escaped |
| 299 | // characters should be interpreted as UTF-8. To solve this problem, |
| 300 | // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN. |
| 301 | // |
| 302 | // We don't bother to optimize the conversion in the ASCII case (which |
| 303 | // *could* just be a copy) and use the UTF-8 path, because it should be |
| 304 | // very rare that host names have escaped characters, and it is relatively |
| 305 | // fast to do the conversion anyway. |
| 306 | StackBuffer utf8; |
| 307 | if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) { |
| 308 | AppendInvalidNarrowString(host, 0, host_len, output); |
| 309 | return false; |
| 310 | } |
| 311 | |
| 312 | // Once we convert to UTF-8, we can use the 8-bit version of the complex |
| 313 | // host handling code above. |
| 314 | return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, |
| 315 | has_escaped, output); |
| 316 | } |
| 317 | |
| 318 | // No unescaping necessary, we can safely pass the input to ICU. This |
| 319 | // function will only get called if we either have escaped or non-ascii |
| 320 | // input, so it's safe to just use ICU now. Even if the input is ASCII, |
| 321 | // this function will do the right thing (just slower than we could). |
| 322 | return DoIDNHost(host, host_len, output); |
| 323 | } |
| 324 | |
| 325 | template <typename CHAR, typename UCHAR> |
| 326 | bool DoHostSubstring(const CHAR* spec, |
| 327 | const Component& host, |
| 328 | CanonOutput* output) { |
| 329 | bool has_non_ascii, has_escaped; |
| 330 | ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); |
| 331 | |
| 332 | if (has_non_ascii || has_escaped) { |
| 333 | return DoComplexHost(&spec[host.begin], host.len, has_non_ascii, |
| 334 | has_escaped, output); |
| 335 | } |
| 336 | |
| 337 | const bool success = |
| 338 | DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii); |
| 339 | DCHECK(!has_non_ascii); |
| 340 | return success; |
| 341 | } |
| 342 | |
| 343 | template <typename CHAR, typename UCHAR> |
| 344 | void DoHost(const CHAR* spec, |
| 345 | const Component& host, |
| 346 | CanonOutput* output, |
| 347 | CanonHostInfo* host_info) { |
| 348 | if (host.len <= 0) { |
| 349 | // Empty hosts don't need anything. |
| 350 | host_info->family = CanonHostInfo::NEUTRAL; |
| 351 | host_info->out_host = Component(); |
| 352 | return; |
| 353 | } |
| 354 | |
| 355 | // Keep track of output's initial length, so we can rewind later. |
| 356 | const int output_begin = output->length(); |
| 357 | |
| 358 | if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) { |
| 359 | // After all the other canonicalization, check if we ended up with an IP |
| 360 | // address. IP addresses are small, so writing into this temporary buffer |
| 361 | // should not cause an allocation. |
| 362 | RawCanonOutput<64> canon_ip; |
| 363 | CanonicalizeIPAddress(output->data(), |
| 364 | MakeRange(output_begin, output->length()), |
| 365 | &canon_ip, host_info); |
| 366 | |
| 367 | // If we got an IPv4/IPv6 address, copy the canonical form back to the |
| 368 | // real buffer. Otherwise, it's a hostname or broken IP, in which case |
| 369 | // we just leave it in place. |
| 370 | if (host_info->IsIPAddress()) { |
| 371 | output->set_length(output_begin); |
| 372 | output->Append(canon_ip.data(), canon_ip.length()); |
| 373 | } |
| 374 | } else { |
| 375 | // Canonicalization failed. Set BROKEN to notify the caller. |
| 376 | host_info->family = CanonHostInfo::BROKEN; |
| 377 | } |
| 378 | |
| 379 | host_info->out_host = MakeRange(output_begin, output->length()); |
| 380 | } |
| 381 | |
| 382 | } // namespace |
| 383 | |
| 384 | bool CanonicalizeHost(const char* spec, |
| 385 | const Component& host, |
| 386 | CanonOutput* output, |
| 387 | Component* out_host) { |
| 388 | CanonHostInfo host_info; |
| 389 | DoHost<char, unsigned char>(spec, host, output, &host_info); |
| 390 | *out_host = host_info.out_host; |
| 391 | return (host_info.family != CanonHostInfo::BROKEN); |
| 392 | } |
| 393 | |
| 394 | bool CanonicalizeHost(const base::char16* spec, |
| 395 | const Component& host, |
| 396 | CanonOutput* output, |
| 397 | Component* out_host) { |
| 398 | CanonHostInfo host_info; |
| 399 | DoHost<base::char16, base::char16>(spec, host, output, &host_info); |
| 400 | *out_host = host_info.out_host; |
| 401 | return (host_info.family != CanonHostInfo::BROKEN); |
| 402 | } |
| 403 | |
| 404 | void CanonicalizeHostVerbose(const char* spec, |
| 405 | const Component& host, |
| 406 | CanonOutput* output, |
| 407 | CanonHostInfo* host_info) { |
| 408 | DoHost<char, unsigned char>(spec, host, output, host_info); |
| 409 | } |
| 410 | |
| 411 | void CanonicalizeHostVerbose(const base::char16* spec, |
| 412 | const Component& host, |
| 413 | CanonOutput* output, |
| 414 | CanonHostInfo* host_info) { |
| 415 | DoHost<base::char16, base::char16>(spec, host, output, host_info); |
| 416 | } |
| 417 | |
| 418 | bool CanonicalizeHostSubstring(const char* spec, |
| 419 | const Component& host, |
| 420 | CanonOutput* output) { |
| 421 | return DoHostSubstring<char, unsigned char>(spec, host, output); |
| 422 | } |
| 423 | |
| 424 | bool CanonicalizeHostSubstring(const base::char16* spec, |
| 425 | const Component& host, |
| 426 | CanonOutput* output) { |
| 427 | return DoHostSubstring<base::char16, base::char16>(spec, host, output); |
| 428 | } |
| 429 | |
| 430 | } // namespace url |