// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "base/logging.h"
6#include "url/url_canon.h"
7#include "url/url_canon_internal.h"
8
9namespace url {
10
11namespace {
12
// For reference, here's what IE supports:
// Key: 0 (disallowed: failure if present in the input)
//      + (allowed either escaped or unescaped, and unmodified)
//      U (allowed escaped or unescaped but always unescaped if present in
//         escaped form)
//      E (allowed escaped or unescaped but always escaped if present in
//         unescaped form)
//      % (only allowed escaped in the input, will be unmodified).
//      I left blank alpha numeric characters.
//
//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
//    -----------------------------------------------
// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
// 3                                 %  %  E  +  E  0  <-- Those are : ; < = > ?
// 4   %
// 5                                    U  0  U  U  U  <-- Those are [ \ ] ^ _
// 6   E                                               <-- That's `
// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
//
// NOTE: I didn't actually test all the control characters. Some may be
// disallowed in the input, but they are all accepted escaped except for 0.
// I also didn't test if characters affecting HTML parsing are allowed
// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
// Surprisingly, space is accepted in the input and always escaped.
39
// Maps each 7-bit input character to its canonical host form:
//   * 0 means the character is not allowed in a host (canonicalization
//     fails);
//   * kEsc means the character is allowed but must be percent-escaped in the
//     output;
//   * any other value is the character to emit (upper-case letters map to
//     their lower-case equivalents).
// We are a little more restrictive than IE, but less restrictive than Firefox.
//
// Note that '%' itself is disallowed. It is of course accepted as the start
// of an escape sequence, but that means "%25" is rejected. IE allows it, but
// doing so would put us in a funny state: for an invalid escape sequence such
// as "%zz" we write "%25zz" to the output and fail. If percents were allowed,
// canonicalizing that output a second time would succeed, so validity would
// depend on how many times the canonicalizer was run. We prefer to always
// report the same validity, so we reject this.
constexpr unsigned char kEsc = 0xff;
constexpr unsigned char kHostCharLookup[0x80] = {
    // 00-1f: all are invalid
    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,
    // ' ' !     "     #     $     %     &     '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0,    kEsc, kEsc,
    // (  )     *     +     ,     -     .     /
    kEsc, kEsc, kEsc, '+',  kEsc, '-',  '.',  0,
    // 0  1     2     3     4     5     6     7
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    // 8  9     :     ;     <     =     >     ?
    '8',  '9',  ':',  0,    kEsc, kEsc, kEsc, 0,
    // @  A     B     C     D     E     F     G
    kEsc, 'a',  'b',  'c',  'd',  'e',  'f',  'g',
    // H  I     J     K     L     M     N     O
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    // P  Q     R     S     T     U     V     W
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    // X  Y     Z     [     (\)   ]     ^     _
    'x',  'y',  'z',  '[',  0,    ']',  0,    '_',
    // `  a     b     c     d     e     f     g
    kEsc, 'a',  'b',  'c',  'd',  'e',  'f',  'g',
    // h  i     j     k     l     m     n     o
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    // p  q     r     s     t     u     v     w
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    // x  y     z     {     |     }     ~     DEL
    'x',  'y',  'z',  kEsc, kEsc, kEsc, 0,    0,
};
69
// Longest fully-qualified domain name permitted by RFC 1034.
constexpr int kMaxHostLength = 253;

// Upper bound on the length of a host handed to IDN processing. It is a
// generous multiple of the RFC 1034 limit because UTS#46 normalization can
// shrink a long string until it fits within 253 characters. Pathological input
// can still defeat this: UTS#46 processing may remove an arbitrary number of
// characters (e.g. U+00AD SOFT HYPHEN). The padding should nevertheless be
// enough for every normally-encountered, non-abusive hostname string.
constexpr int kMaxHostBufferLength = 5 * kMaxHostLength;
80
81const int kTempHostBufferLen = 1024;
82typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
83typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
84
85// Scans a host name and fills in the output flags according to what we find.
86// |has_non_ascii| will be true if there are any non-7-bit characters, and
87// |has_escaped| will be true if there is a percent sign.
88template<typename CHAR, typename UCHAR>
89void ScanHostname(const CHAR* spec,
90 const Component& host,
91 bool* has_non_ascii,
92 bool* has_escaped) {
93 int end = host.end();
94 *has_non_ascii = false;
95 *has_escaped = false;
96 for (int i = host.begin; i < end; i++) {
97 if (static_cast<UCHAR>(spec[i]) >= 0x80)
98 *has_non_ascii = true;
99 else if (spec[i] == '%')
100 *has_escaped = true;
101 }
102}
103
104// Canonicalizes a host name that is entirely 8-bit characters (even though
105// the type holding them may be 16 bits. Escaped characters will be unescaped.
106// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
107//
108// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
109// the output.
110//
111// This function is used in two situations:
112//
113// * When the caller knows there is no non-ASCII or percent escaped
114// characters. This is what DoHost does. The result will be a completely
115// canonicalized host since we know nothing weird can happen (escaped
116// characters could be unescaped to non-7-bit, so they have to be treated
117// with suspicion at this point). It does not use the |has_non_ascii| flag.
118//
119// * When the caller has an 8-bit string that may need unescaping.
120// DoComplexHost calls us this situation to do unescaping and validation.
121// After this, it may do other IDN operations depending on the value of the
122// |*has_non_ascii| flag.
123//
124// The return value indicates if the output is a potentially valid host name.
125template<typename INCHAR, typename OUTCHAR>
126bool DoSimpleHost(const INCHAR* host,
127 int host_len,
128 CanonOutputT<OUTCHAR>* output,
129 bool* has_non_ascii) {
130 *has_non_ascii = false;
131
132 bool success = true;
133 for (int i = 0; i < host_len; ++i) {
134 unsigned int source = host[i];
135 if (source == '%') {
136 // Unescape first, if possible.
137 // Source will be used only if decode operation was successful.
138 if (!DecodeEscaped(host, &i, host_len,
139 reinterpret_cast<unsigned char*>(&source))) {
140 // Invalid escaped character. There is nothing that can make this
141 // host valid. We append an escaped percent so the URL looks reasonable
142 // and mark as failed.
143 AppendEscapedChar('%', output);
144 success = false;
145 continue;
146 }
147 }
148
149 if (source < 0x80) {
150 // We have ASCII input, we can use our lookup table.
151 unsigned char replacement = kHostCharLookup[source];
152 if (!replacement) {
153 // Invalid character, add it as percent-escaped and mark as failed.
154 AppendEscapedChar(source, output);
155 success = false;
156 } else if (replacement == kEsc) {
157 // This character is valid but should be escaped.
158 AppendEscapedChar(source, output);
159 } else {
160 // Common case, the given character is valid in a hostname, the lookup
161 // table tells us the canonical representation of that character (lower
162 // cased).
163 output->push_back(replacement);
164 }
165 } else {
166 // It's a non-ascii char. Just push it to the output.
167 // In case where we have char16 input, and char output it's safe to
168 // cast char16->char only if input string was converted to ASCII.
169 output->push_back(static_cast<OUTCHAR>(source));
170 *has_non_ascii = true;
171 }
172 }
173 return success;
174}
175
176// Canonicalizes a host that requires IDN conversion. Returns true on success
177bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
178 int original_output_len = output->length(); // So we can rewind below.
179
180 // We need to escape URL before doing IDN conversion, since punicode strings
181 // cannot be escaped after they are created.
182 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
183 bool has_non_ascii;
184 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
185 if (url_escaped_host.length() > kMaxHostBufferLength) {
186 AppendInvalidNarrowString(src, 0, src_len, output);
187 return false;
188 }
189
190 StackBufferW wide_output;
191 if (!IDNToASCII(url_escaped_host.data(),
192 url_escaped_host.length(),
193 &wide_output)) {
194 // Some error, give up. This will write some reasonable looking
195 // representation of the string to the output.
196 AppendInvalidNarrowString(src, 0, src_len, output);
197 return false;
198 }
199
200 // Now we check the ASCII output like a normal host. It will also handle
201 // unescaping. Although we unescaped everything before this function call, if
202 // somebody does %00 as fullwidth, ICU will convert this to ASCII.
203 bool success = DoSimpleHost(wide_output.data(),
204 wide_output.length(),
205 output, &has_non_ascii);
206 if (has_non_ascii) {
207 // ICU generated something that DoSimpleHost didn't think looked like
208 // ASCII. This is quite rare, but ICU might convert some characters to
209 // percent signs which might generate new escape sequences which might in
210 // turn be invalid. An example is U+FE6A "small percent" which ICU will
211 // name prep into an ASCII percent and then we can interpret the following
212 // characters as escaped characters.
213 //
214 // If DoSimpleHost didn't think the output was ASCII, just escape the
215 // thing we gave ICU and give up. DoSimpleHost will have handled a further
216 // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
217 // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
218 // do more (like handle escaped non-ASCII sequences). Handling the escaped
219 // ASCII isn't strictly necessary, but DoSimpleHost handles this case
220 // anyway so we handle it/
221 output->set_length(original_output_len);
222 AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
223 output);
224 return false;
225 }
226 return success;
227}
228
229// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
230// UTF-16. The has_escaped flag should be set if the input string requires
231// unescaping.
232bool DoComplexHost(const char* host, int host_len,
233 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
234 // Save the current position in the output. We may write stuff and rewind it
235 // below, so we need to know where to rewind to.
236 int begin_length = output->length();
237
238 // Points to the UTF-8 data we want to convert. This will either be the
239 // input or the unescaped version written to |*output| if necessary.
240 const char* utf8_source;
241 int utf8_source_len;
242 if (has_escaped) {
243 // Unescape before converting to UTF-16 for IDN. We write this into the
244 // output because it most likely does not require IDNization, and we can
245 // save another huge stack buffer. It will be replaced below if it requires
246 // IDN. This will also update our non-ASCII flag so we know whether the
247 // unescaped input requires IDN.
248 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
249 // Error with some escape sequence. We'll call the current output
250 // complete. DoSimpleHost will have written some "reasonable" output.
251 return false;
252 }
253
254 // Unescaping may have left us with ASCII input, in which case the
255 // unescaped version we wrote to output is complete.
256 if (!has_non_ascii) {
257 return true;
258 }
259
260 // Save the pointer into the data was just converted (it may be appended to
261 // other data in the output buffer).
262 utf8_source = &output->data()[begin_length];
263 utf8_source_len = output->length() - begin_length;
264 } else {
265 // We don't need to unescape, use input for IDNization later. (We know the
266 // input has non-ASCII, or the simple version would have been called
267 // instead of us.)
268 utf8_source = host;
269 utf8_source_len = host_len;
270 }
271
272 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
273 // Above, we may have used the output to write the unescaped values to, so
274 // we have to rewind it to where we started after we convert it to UTF-16.
275 StackBufferW utf16;
276 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
277 // In this error case, the input may or may not be the output.
278 StackBuffer utf8;
279 for (int i = 0; i < utf8_source_len; i++)
280 utf8.push_back(utf8_source[i]);
281 output->set_length(begin_length);
282 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
283 return false;
284 }
285 output->set_length(begin_length);
286
287 // This will call DoSimpleHost which will do normal ASCII canonicalization
288 // and also check for IP addresses in the outpt.
289 return DoIDNHost(utf16.data(), utf16.length(), output);
290}
291
292// UTF-16 convert host to its ASCII version. The set up is already ready for
293// the backend, so we just pass through. The has_escaped flag should be set if
294// the input string requires unescaping.
295bool DoComplexHost(const base::char16* host, int host_len,
296 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
297 if (has_escaped) {
298 // Yikes, we have escaped characters with wide input. The escaped
299 // characters should be interpreted as UTF-8. To solve this problem,
300 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
301 //
302 // We don't bother to optimize the conversion in the ASCII case (which
303 // *could* just be a copy) and use the UTF-8 path, because it should be
304 // very rare that host names have escaped characters, and it is relatively
305 // fast to do the conversion anyway.
306 StackBuffer utf8;
307 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
308 AppendInvalidNarrowString(host, 0, host_len, output);
309 return false;
310 }
311
312 // Once we convert to UTF-8, we can use the 8-bit version of the complex
313 // host handling code above.
314 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
315 has_escaped, output);
316 }
317
318 // No unescaping necessary, we can safely pass the input to ICU. This
319 // function will only get called if we either have escaped or non-ascii
320 // input, so it's safe to just use ICU now. Even if the input is ASCII,
321 // this function will do the right thing (just slower than we could).
322 return DoIDNHost(host, host_len, output);
323}
324
325template <typename CHAR, typename UCHAR>
326bool DoHostSubstring(const CHAR* spec,
327 const Component& host,
328 CanonOutput* output) {
329 bool has_non_ascii, has_escaped;
330 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
331
332 if (has_non_ascii || has_escaped) {
333 return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
334 has_escaped, output);
335 }
336
337 const bool success =
338 DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
339 DCHECK(!has_non_ascii);
340 return success;
341}
342
343template <typename CHAR, typename UCHAR>
344void DoHost(const CHAR* spec,
345 const Component& host,
346 CanonOutput* output,
347 CanonHostInfo* host_info) {
348 if (host.len <= 0) {
349 // Empty hosts don't need anything.
350 host_info->family = CanonHostInfo::NEUTRAL;
351 host_info->out_host = Component();
352 return;
353 }
354
355 // Keep track of output's initial length, so we can rewind later.
356 const int output_begin = output->length();
357
358 if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
359 // After all the other canonicalization, check if we ended up with an IP
360 // address. IP addresses are small, so writing into this temporary buffer
361 // should not cause an allocation.
362 RawCanonOutput<64> canon_ip;
363 CanonicalizeIPAddress(output->data(),
364 MakeRange(output_begin, output->length()),
365 &canon_ip, host_info);
366
367 // If we got an IPv4/IPv6 address, copy the canonical form back to the
368 // real buffer. Otherwise, it's a hostname or broken IP, in which case
369 // we just leave it in place.
370 if (host_info->IsIPAddress()) {
371 output->set_length(output_begin);
372 output->Append(canon_ip.data(), canon_ip.length());
373 }
374 } else {
375 // Canonicalization failed. Set BROKEN to notify the caller.
376 host_info->family = CanonHostInfo::BROKEN;
377 }
378
379 host_info->out_host = MakeRange(output_begin, output->length());
380}
381
382} // namespace
383
384bool CanonicalizeHost(const char* spec,
385 const Component& host,
386 CanonOutput* output,
387 Component* out_host) {
388 CanonHostInfo host_info;
389 DoHost<char, unsigned char>(spec, host, output, &host_info);
390 *out_host = host_info.out_host;
391 return (host_info.family != CanonHostInfo::BROKEN);
392}
393
394bool CanonicalizeHost(const base::char16* spec,
395 const Component& host,
396 CanonOutput* output,
397 Component* out_host) {
398 CanonHostInfo host_info;
399 DoHost<base::char16, base::char16>(spec, host, output, &host_info);
400 *out_host = host_info.out_host;
401 return (host_info.family != CanonHostInfo::BROKEN);
402}
403
404void CanonicalizeHostVerbose(const char* spec,
405 const Component& host,
406 CanonOutput* output,
407 CanonHostInfo* host_info) {
408 DoHost<char, unsigned char>(spec, host, output, host_info);
409}
410
411void CanonicalizeHostVerbose(const base::char16* spec,
412 const Component& host,
413 CanonOutput* output,
414 CanonHostInfo* host_info) {
415 DoHost<base::char16, base::char16>(spec, host, output, host_info);
416}
417
418bool CanonicalizeHostSubstring(const char* spec,
419 const Component& host,
420 CanonOutput* output) {
421 return DoHostSubstring<char, unsigned char>(spec, host, output);
422}
423
424bool CanonicalizeHostSubstring(const base::char16* spec,
425 const Component& host,
426 CanonOutput* output) {
427 return DoHostSubstring<base::char16, base::char16>(spec, host, output);
428}
429
430} // namespace url