Blame - src/url/url_canon_host.cc - cobalt

blob: 9f315477960e83857c25845de33b72a1d172bbc8 [file] [log] [blame]

Andrew Top	0d1858f	2019-05-15 22:01:47 -0700	[diff] [blame]	1	// Copyright 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "base/logging.h"
				6	#include "url/url_canon.h"
				7	#include "url/url_canon_internal.h"
				8
				9	namespace url {
				10
				11	namespace {
				12
				13	// For reference, here's what IE supports:
				14	// Key: 0 (disallowed: failure if present in the input)
				15	// + (allowed either escaped or unescaped, and unmodified)
				16	// U (allowed escaped or unescaped but always unescaped if present in
				17	// escaped form)
				18	// E (allowed escaped or unescaped but always escaped if present in
				19	// unescaped form)
				20	// % (only allowed escaped in the input, will be unmodified).
				21	// I left blank alpha numeric characters.
				22	//
				23	//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
				24	//    -----------------------------------------------
				25	// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
				26	// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
				27	// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
				28	// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
				29	// 4   %
				30	// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
				31	// 6   E                                               <-- That's  `
				32	// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
				33	//
				34	// NOTE: I didn't actually test all the control characters. Some may be
				35	// disallowed in the input, but they are all accepted escaped except for 0.
				36	// I also didn't test if characters affecting HTML parsing are allowed
				37	// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
				38	// Surprisingly, space is accepted in the input and always escaped.
				39
				40	// This table lists the canonical version of all characters we allow in the
				41	// input, with 0 indicating it is disallowed. We use the magic kEsc value to
				42	// indicate that this character should be escaped. We are a little more
				43	// restrictive than IE, but less restrictive than Firefox.
				44	//
				45	// Note that we disallow the % character. We will allow it when part of an
				46	// escape sequence, of course, but this disallows "%25". Even though IE allows
				47	// it, allowing it would put us in a funny state. If there was an invalid
				48	// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
				49	// Allowing percents means we'll succeed a second time, so validity would change
				50	// based on how many times you run the canonicalizer. We prefer to always report
				51	// the same validity, so reject this.
					//
					// kEsc is the marker stored in the table for characters that are accepted in
					// the input but are written to the output in percent-escaped form.
				52	const unsigned char kEsc = 0xff;
					// Indexed by 7-bit character value (0x00-0x7f). Upper-case letters map to
					// their lower-case form; every other accepted character maps to itself.
				53	const unsigned char kHostCharLookup[0x80] = {
				54	// 00-1f: all are invalid
				55	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				56	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				57	// ' ' ! " # $ % & ' ( ) * + , - . /
				58	kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,
				59	// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
				60	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,
				61	// @ A B C D E F G H I J K L M N O
				62	kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
				63	// P Q R S T U V W X Y Z [ \ ] ^ _
				64	'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
				65	// ` a b c d e f g h i j k l m n o
				66	kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
				67	// p q r s t u v w x y z { | } ~ (0x7f, unprintable)
				68	'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };
				69
				// Longest fully-qualified domain name permitted by RFC 1034.
				constexpr int kMaxHostLength = 253;

				// Size limit applied to a host before it is handed to IDN processing. UTS#46
				// normalization can shrink a string, so an input longer than the RFC 1034
				// limit may still end up fitting within it; the limit is therefore padded
				// generously. Pathological inputs can still exceed it (UTS#46 may drop an
				// arbitrary number of characters such as U+00AD SOFT HYPHEN), but this is
				// enough for every normally-encountered, non-abusive hostname.
				constexpr int kMaxHostBufferLength = 5 * kMaxHostLength;
				80
				81	const int kTempHostBufferLen = 1024;
				82	typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
				83	typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
				84
				85	// Scans a host name and fills in the output flags according to what we find.
				86	// \|has_non_ascii\| will be true if there are any non-7-bit characters, and
				87	// \|has_escaped\| will be true if there is a percent sign.
				88	template<typename CHAR, typename UCHAR>
				89	void ScanHostname(const CHAR* spec,
				90	const Component& host,
				91	bool* has_non_ascii,
				92	bool* has_escaped) {
				93	int end = host.end();
				94	*has_non_ascii = false;
				95	*has_escaped = false;
				96	for (int i = host.begin; i < end; i++) {
				97	if (static_cast<UCHAR>(spec[i]) >= 0x80)
				98	*has_non_ascii = true;
				99	else if (spec[i] == '%')
				100	*has_escaped = true;
				101	}
				102	}
				103
				104	// Canonicalizes a host name that is entirely 8-bit characters (even though
				105	// the type holding them may be 16 bits. Escaped characters will be unescaped.
				106	// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
				107	//
				108	// The \|*has_non_ascii\| flag will be true if there are non-7-bit characters in
				109	// the output.
				110	//
				111	// This function is used in two situations:
				112	//
				113	// * When the caller knows there is no non-ASCII or percent escaped
				114	// characters. This is what DoHost does. The result will be a completely
				115	// canonicalized host since we know nothing weird can happen (escaped
				116	// characters could be unescaped to non-7-bit, so they have to be treated
				117	// with suspicion at this point). It does not use the \|has_non_ascii\| flag.
				118	//
				119	// * When the caller has an 8-bit string that may need unescaping.
				120	// DoComplexHost calls us this situation to do unescaping and validation.
				121	// After this, it may do other IDN operations depending on the value of the
				122	// \|*has_non_ascii\| flag.
				123	//
				124	// The return value indicates if the output is a potentially valid host name.
				125	template<typename INCHAR, typename OUTCHAR>
				126	bool DoSimpleHost(const INCHAR* host,
				127	int host_len,
				128	CanonOutputT<OUTCHAR>* output,
				129	bool* has_non_ascii) {
				130	*has_non_ascii = false;
				131
				132	bool success = true;
				133	for (int i = 0; i < host_len; ++i) {
				134	unsigned int source = host[i];
				135	if (source == '%') {
				136	// Unescape first, if possible.
				137	// Source will be used only if decode operation was successful.
				138	if (!DecodeEscaped(host, &i, host_len,
				139	reinterpret_cast<unsigned char*>(&source))) {
				140	// Invalid escaped character. There is nothing that can make this
				141	// host valid. We append an escaped percent so the URL looks reasonable
				142	// and mark as failed.
				143	AppendEscapedChar('%', output);
				144	success = false;
				145	continue;
				146	}
				147	}
				148
				149	if (source < 0x80) {
				150	// We have ASCII input, we can use our lookup table.
				151	unsigned char replacement = kHostCharLookup[source];
				152	if (!replacement) {
				153	// Invalid character, add it as percent-escaped and mark as failed.
				154	AppendEscapedChar(source, output);
				155	success = false;
				156	} else if (replacement == kEsc) {
				157	// This character is valid but should be escaped.
				158	AppendEscapedChar(source, output);
				159	} else {
				160	// Common case, the given character is valid in a hostname, the lookup
				161	// table tells us the canonical representation of that character (lower
				162	// cased).
				163	output->push_back(replacement);
				164	}
				165	} else {
				166	// It's a non-ascii char. Just push it to the output.
				167	// In case where we have char16 input, and char output it's safe to
				168	// cast char16->char only if input string was converted to ASCII.
				169	output->push_back(static_cast<OUTCHAR>(source));
				170	*has_non_ascii = true;
				171	}
				172	}
				173	return success;
				174	}
				175
				176	// Canonicalizes a host that requires IDN conversion. Returns true on success
				177	bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
				178	int original_output_len = output->length(); // So we can rewind below.
				179
				180	// We need to escape URL before doing IDN conversion, since punicode strings
				181	// cannot be escaped after they are created.
				182	RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
				183	bool has_non_ascii;
				184	DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
				185	if (url_escaped_host.length() > kMaxHostBufferLength) {
				186	AppendInvalidNarrowString(src, 0, src_len, output);
				187	return false;
				188	}
				189
				190	StackBufferW wide_output;
				191	if (!IDNToASCII(url_escaped_host.data(),
				192	url_escaped_host.length(),
				193	&wide_output)) {
				194	// Some error, give up. This will write some reasonable looking
				195	// representation of the string to the output.
				196	AppendInvalidNarrowString(src, 0, src_len, output);
				197	return false;
				198	}
				199
				200	// Now we check the ASCII output like a normal host. It will also handle
				201	// unescaping. Although we unescaped everything before this function call, if
				202	// somebody does %00 as fullwidth, ICU will convert this to ASCII.
				203	bool success = DoSimpleHost(wide_output.data(),
				204	wide_output.length(),
				205	output, &has_non_ascii);
				206	if (has_non_ascii) {
				207	// ICU generated something that DoSimpleHost didn't think looked like
				208	// ASCII. This is quite rare, but ICU might convert some characters to
				209	// percent signs which might generate new escape sequences which might in
				210	// turn be invalid. An example is U+FE6A "small percent" which ICU will
				211	// name prep into an ASCII percent and then we can interpret the following
				212	// characters as escaped characters.
				213	//
				214	// If DoSimpleHost didn't think the output was ASCII, just escape the
				215	// thing we gave ICU and give up. DoSimpleHost will have handled a further
				216	// level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
				217	// a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
				218	// do more (like handle escaped non-ASCII sequences). Handling the escaped
				219	// ASCII isn't strictly necessary, but DoSimpleHost handles this case
				220	// anyway so we handle it/
				221	output->set_length(original_output_len);
				222	AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
				223	output);
				224	return false;
				225	}
				226	return success;
				227	}
				228
				229	// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
				230	// UTF-16. The has_escaped flag should be set if the input string requires
				231	// unescaping.
				232	bool DoComplexHost(const char* host, int host_len,
				233	bool has_non_ascii, bool has_escaped, CanonOutput* output) {
				234	// Save the current position in the output. We may write stuff and rewind it
				235	// below, so we need to know where to rewind to.
				236	int begin_length = output->length();
				237
				238	// Points to the UTF-8 data we want to convert. This will either be the
				239	// input or the unescaped version written to \|*output\| if necessary.
				240	const char* utf8_source;
				241	int utf8_source_len;
				242	if (has_escaped) {
				243	// Unescape before converting to UTF-16 for IDN. We write this into the
				244	// output because it most likely does not require IDNization, and we can
				245	// save another huge stack buffer. It will be replaced below if it requires
				246	// IDN. This will also update our non-ASCII flag so we know whether the
				247	// unescaped input requires IDN.
				248	if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
				249	// Error with some escape sequence. We'll call the current output
				250	// complete. DoSimpleHost will have written some "reasonable" output.
				251	return false;
				252	}
				253
				254	// Unescaping may have left us with ASCII input, in which case the
				255	// unescaped version we wrote to output is complete.
				256	if (!has_non_ascii) {
				257	return true;
				258	}
				259
				260	// Save the pointer into the data was just converted (it may be appended to
				261	// other data in the output buffer).
				262	utf8_source = &output->data()[begin_length];
				263	utf8_source_len = output->length() - begin_length;
				264	} else {
				265	// We don't need to unescape, use input for IDNization later. (We know the
				266	// input has non-ASCII, or the simple version would have been called
				267	// instead of us.)
				268	utf8_source = host;
				269	utf8_source_len = host_len;
				270	}
				271
				272	// Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
				273	// Above, we may have used the output to write the unescaped values to, so
				274	// we have to rewind it to where we started after we convert it to UTF-16.
				275	StackBufferW utf16;
				276	if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
				277	// In this error case, the input may or may not be the output.
				278	StackBuffer utf8;
				279	for (int i = 0; i < utf8_source_len; i++)
				280	utf8.push_back(utf8_source[i]);
				281	output->set_length(begin_length);
				282	AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
				283	return false;
				284	}
				285	output->set_length(begin_length);
				286
				287	// This will call DoSimpleHost which will do normal ASCII canonicalization
				288	// and also check for IP addresses in the outpt.
				289	return DoIDNHost(utf16.data(), utf16.length(), output);
				290	}
				291
				292	// UTF-16 convert host to its ASCII version. The set up is already ready for
				293	// the backend, so we just pass through. The has_escaped flag should be set if
				294	// the input string requires unescaping.
				295	bool DoComplexHost(const base::char16* host, int host_len,
				296	bool has_non_ascii, bool has_escaped, CanonOutput* output) {
				297	if (has_escaped) {
				298	// Yikes, we have escaped characters with wide input. The escaped
				299	// characters should be interpreted as UTF-8. To solve this problem,
				300	// we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
				301	//
				302	// We don't bother to optimize the conversion in the ASCII case (which
				303	// could just be a copy) and use the UTF-8 path, because it should be
				304	// very rare that host names have escaped characters, and it is relatively
				305	// fast to do the conversion anyway.
				306	StackBuffer utf8;
				307	if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
				308	AppendInvalidNarrowString(host, 0, host_len, output);
				309	return false;
				310	}
				311
				312	// Once we convert to UTF-8, we can use the 8-bit version of the complex
				313	// host handling code above.
				314	return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
				315	has_escaped, output);
				316	}
				317
				318	// No unescaping necessary, we can safely pass the input to ICU. This
				319	// function will only get called if we either have escaped or non-ascii
				320	// input, so it's safe to just use ICU now. Even if the input is ASCII,
				321	// this function will do the right thing (just slower than we could).
				322	return DoIDNHost(host, host_len, output);
				323	}
				324
				325	template <typename CHAR, typename UCHAR>
				326	bool DoHostSubstring(const CHAR* spec,
				327	const Component& host,
				328	CanonOutput* output) {
				329	bool has_non_ascii, has_escaped;
				330	ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
				331
				332	if (has_non_ascii \|\| has_escaped) {
				333	return DoComplexHost(&spec[host.begin], host.len, has_non_ascii,
				334	has_escaped, output);
				335	}
				336
				337	const bool success =
				338	DoSimpleHost(&spec[host.begin], host.len, output, &has_non_ascii);
				339	DCHECK(!has_non_ascii);
				340	return success;
				341	}
				342
				343	template <typename CHAR, typename UCHAR>
				344	void DoHost(const CHAR* spec,
				345	const Component& host,
				346	CanonOutput* output,
				347	CanonHostInfo* host_info) {
				348	if (host.len <= 0) {
				349	// Empty hosts don't need anything.
				350	host_info->family = CanonHostInfo::NEUTRAL;
				351	host_info->out_host = Component();
				352	return;
				353	}
				354
				355	// Keep track of output's initial length, so we can rewind later.
				356	const int output_begin = output->length();
				357
				358	if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
				359	// After all the other canonicalization, check if we ended up with an IP
				360	// address. IP addresses are small, so writing into this temporary buffer
				361	// should not cause an allocation.
				362	RawCanonOutput<64> canon_ip;
				363	CanonicalizeIPAddress(output->data(),
				364	MakeRange(output_begin, output->length()),
				365	&canon_ip, host_info);
				366
				367	// If we got an IPv4/IPv6 address, copy the canonical form back to the
				368	// real buffer. Otherwise, it's a hostname or broken IP, in which case
				369	// we just leave it in place.
				370	if (host_info->IsIPAddress()) {
				371	output->set_length(output_begin);
				372	output->Append(canon_ip.data(), canon_ip.length());
				373	}
				374	} else {
				375	// Canonicalization failed. Set BROKEN to notify the caller.
				376	host_info->family = CanonHostInfo::BROKEN;
				377	}
				378
				379	host_info->out_host = MakeRange(output_begin, output->length());
				380	}
				381
				382	} // namespace
				383
				384	bool CanonicalizeHost(const char* spec,
				385	const Component& host,
				386	CanonOutput* output,
				387	Component* out_host) {
				388	CanonHostInfo host_info;
				389	DoHost<char, unsigned char>(spec, host, output, &host_info);
				390	*out_host = host_info.out_host;
				391	return (host_info.family != CanonHostInfo::BROKEN);
				392	}
				393
				394	bool CanonicalizeHost(const base::char16* spec,
				395	const Component& host,
				396	CanonOutput* output,
				397	Component* out_host) {
				398	CanonHostInfo host_info;
				399	DoHost<base::char16, base::char16>(spec, host, output, &host_info);
				400	*out_host = host_info.out_host;
				401	return (host_info.family != CanonHostInfo::BROKEN);
				402	}
				403
				404	void CanonicalizeHostVerbose(const char* spec,
				405	const Component& host,
				406	CanonOutput* output,
				407	CanonHostInfo* host_info) {
				408	DoHost<char, unsigned char>(spec, host, output, host_info);
				409	}
				410
				411	void CanonicalizeHostVerbose(const base::char16* spec,
				412	const Component& host,
				413	CanonOutput* output,
				414	CanonHostInfo* host_info) {
				415	DoHost<base::char16, base::char16>(spec, host, output, host_info);
				416	}
				417
				418	bool CanonicalizeHostSubstring(const char* spec,
				419	const Component& host,
				420	CanonOutput* output) {
				421	return DoHostSubstring<char, unsigned char>(spec, host, output);
				422	}
				423
				424	bool CanonicalizeHostSubstring(const base::char16* spec,
				425	const Component& host,
				426	CanonOutput* output) {
				427	return DoHostSubstring<base::char16, base::char16>(spec, host, output);
				428	}
				429
				430	} // namespace url