blob: 5ea2f9906d64f1ae444ed0d297931c630684eb9c [file] [log] [blame]
/*
* Copyright 2007 Google Inc. All rights reserved.
* Copyright 2012 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "URLCanon.h"
#include "RawURLBuffer.h"
#include "URLCanonInternal.h"
#if USE(WTFURL)
namespace WTF {
namespace URLCanonicalizer {
namespace {
// For reference, here's what IE supports:
// Key: 0 (disallowed: failure if present in the input)
// + (allowed either escaped or unescaped, and unmodified)
// U (allowed escaped or unescaped but always unescaped if present in
// escaped form)
// E (allowed escaped or unescaped but always escaped if present in
// unescaped form)
// % (only allowed escaped in the input, will be unmodified).
// I left blank alpha numeric characters.
//
// 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
// -----------------------------------------------
// 0 0 E E E E E E E E E E E E E E E
// 1 E E E E E E E E E E E E E E E E
// 2 E + E E + E + + + + + + + U U 0
// 3 % % E + E 0 <-- Those are : ; < = > ?
// 4 %
// 5 U 0 U U U <-- Those are [ \ ] ^ _
// 6 E <-- That's `
// 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE)
//
// NOTE: I didn't actually test all the control characters. Some may be
// disallowed in the input, but they are all accepted escaped except for 0.
// I also didn't test if characters affecting HTML parsing are allowed
// unescaped, eg. (") or (#), which would indicate the beginning of the path.
// Surprisingly, space is accepted in the input and always escaped.
// This table lists the canonical version of all characters we allow in the
// input, with 0 indicating it is disallowed. We use the magic kEsc value to
// indicate that this character should be escaped. We are a little more
// restrictive than IE, but less restrictive than Firefox.
//
// Note that we disallow the % character. We will allow it when part of an
// escape sequence, of course, but this disallows "%25". Even though IE allows
// it, allowing it would put us in a funny state. If there was an invalid
// escape sequence like "%zz", we'll add "%25zz" to the output and fail.
// Allowing percents means we'll succeed a second time, so validity would change
// based on how many times you run the canonicalizer. We prefer to always report
// the same validity, so reject this.
// Sentinel stored in kHostCharLookup for characters that are accepted in a
// host name but must be written to the output in percent-escaped form.
const unsigned char kEsc = 0xff;
// Lookup table indexed by 7-bit ASCII code (0x00-0x7f). Each entry is one of:
//   0     - the character is not allowed in a host; doSimpleHost escapes it
//           and reports failure,
//   kEsc  - the character is allowed but is always emitted percent-escaped,
//   other - the canonical character to emit (upper-case letters map to their
//           lower-case form).
const unsigned char kHostCharLookup[0x80] = {
// 0x00-0x0f: control characters, all disallowed.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// 0x10-0x1f: control characters, all disallowed.
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// ' ' ! " # $ % & ' ( ) * + , - . /
kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc, kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 , kEsc, kEsc, kEsc, 0 ,
// @ A B C D E F G H I J K L M N O
kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
// P Q R S T U V W X Y Z [ \ ] ^ _
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
// ` a b c d e f g h i j k l m n o
kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
// p q r s t u v w x y z { | } ~ (0x7f)
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', kEsc, kEsc, kEsc, 0 , 0 };
// Temporary narrow (char) and wide (UChar) buffers used by the host
// canonicalization routines below.
typedef RawURLBuffer<char> StackBuffer;
typedef RawURLBuffer<UChar> StackBufferW;
// Walks the |host| component of |spec| once and reports what kind of
// characters it contains:
//   |hasNonASCII| - set to true if any character is outside the 7-bit range.
//   |hasEscaped|  - set to true if a 7-bit '%' character is present.
// Both flags are reset to false before the scan starts. UCHAR is the unsigned
// type used to compare each character against 0x80.
template<typename CharacterType, typename UCHAR>
void scanHostname(const CharacterType* spec, const URLComponent& host, bool& hasNonASCII, bool& hasEscaped)
{
    hasEscaped = false;
    hasNonASCII = false;

    const int limit = host.end();
    int index = host.begin();
    while (index < limit) {
        const CharacterType current = spec[index++];
        if (static_cast<UCHAR>(current) >= 0x80) {
            hasNonASCII = true;
            continue;
        }
        if (current == '%')
            hasEscaped = true;
    }
}
// Canonicalizes a host name that is entirely 8-bit characters (even though
// the type holding them may be 16 bits). Escaped characters will be unescaped.
// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
//
// Parameters:
//   |host|, |hostLength| - the characters of the host to canonicalize.
//   |output|             - receives the canonicalized characters (appended).
//   |hasNonASCII|        - reset to false on entry; set to true if any
//                          non-7-bit character was written to |output|.
//
// This function is used in two situations:
//
// * When the caller knows there are no non-ASCII or percent-escaped
//   characters. This is what doHost does. The result will be a completely
//   canonicalized host since we know nothing weird can happen (escaped
//   characters could be unescaped to non-7-bit, so they have to be treated
//   with suspicion at this point). It does not use the |hasNonASCII| flag.
//
// * When the caller has an 8-bit string that may need unescaping.
//   doComplexHost calls us in this situation to do unescaping and validation.
//   After this, it may do other IDN operations depending on the value of the
//   |hasNonASCII| flag.
//
// The return value indicates if the output is a potentially valid host name.
template<typename INCHAR, typename OUTCHAR>
bool doSimpleHost(const INCHAR* host, int hostLength, URLBuffer<OUTCHAR>& output, bool& hasNonASCII)
{
    hasNonASCII = false;
    bool success = true;
    for (int i = 0; i < hostLength; ++i) {
        unsigned source = host[i];
        if (source == '%') {
            // Unescape first, if possible. The decoded byte goes into a real
            // unsigned char rather than through a reinterpret_cast of
            // |source|: writing a single byte into an |unsigned| only lands in
            // the low-order byte on little-endian targets.
            unsigned char unescaped = 0;
            if (!DecodeEscaped(host, &i, hostLength, &unescaped)) {
                // Invalid escaped character. There is nothing that can make this
                // host valid. We append an escaped percent so the URL looks
                // reasonable and mark as failed.
                appendURLEscapedCharacter('%', output);
                success = false;
                continue;
            }
            // The decoded value is used only when decoding succeeded.
            source = unescaped;
        }

        if (source < 0x80) {
            // We have ASCII input, we can use our lookup table.
            unsigned char replacement = kHostCharLookup[source];
            if (!replacement) {
                // Invalid character, add it as percent-escaped and mark as failed.
                appendURLEscapedCharacter(source, output);
                success = false;
            } else if (replacement == kEsc) {
                // This character is valid but should be escaped.
                appendURLEscapedCharacter(source, output);
            } else {
                // Common case, the given character is valid in a hostname, the
                // lookup table tells us the canonical representation of that
                // character (lower cased).
                output.append(replacement);
            }
        } else {
            // It's a non-ASCII char. Just push it to the output.
            // In case where we have UChar input, and char output it's safe to
            // cast UChar->char only if input string was converted to ASCII.
            output.append(static_cast<OUTCHAR>(source));
            hasNonASCII = true;
        }
    }
    return success;
}
// Canonicalizes a host that requires IDN conversion. Returns true on success
bool doIDNHost(const UChar* src, int sourceLength, URLBuffer<char>& output)
{
// We need to escape URL before doing IDN conversion, since punicode strings
// cannot be escaped after they are created.
RawURLBuffer<UChar> urlEscapedHost;
bool hasNonASCII;
doSimpleHost(src, sourceLength, urlEscapedHost, hasNonASCII);
StackBufferW wideOutput;
if (!IDNToASCII(urlEscapedHost.data(),
urlEscapedHost.length(),
wideOutput)) {
// Some error, give up. This will write some reasonable looking
// representation of the string to the output.
AppendInvalidNarrowString(src, 0, sourceLength, output);
return false;
}
// Now we check the ASCII output like a normal host. It will also handle
// unescaping. Although we unescaped everything before this function call, if
// somebody does %00 as fullwidth, ICU will convert this to ASCII.
bool success = doSimpleHost(wideOutput.data(), wideOutput.length(), output, hasNonASCII);
ASSERT(!hasNonASCII);
return success;
}
// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
// UTF-16. The hasEscaped flag should be set if the input string requires
// unescaping.
bool doComplexHost(const char* host, int hostLength, bool hasNonASCII, bool hasEscaped, URLBuffer<char>& output)
{
// Save the current position in the output. We may write stuff and rewind it
// below, so we need to know where to rewind to.
int beginLength = output.length();
// Points to the UTF-8 data we want to convert. This will either be the
// input or the unescaped version written to |output| if necessary.
const char* utf8Source;
int utf8SourceLength;
if (hasEscaped) {
// Unescape before converting to UTF-16 for IDN. We write this into the
// output because it most likely does not require IDNization, and we can
// save another huge stack buffer. It will be replaced below if it requires
// IDN. This will also update our non-ASCII flag so we know whether the
// unescaped input requires IDN.
if (!doSimpleHost(host, hostLength, output, hasNonASCII)) {
// Error with some escape sequence. We'll call the current output
// complete. doSimpleHost will have written some "reasonable" output.
return false;
}
// Unescaping may have left us with ASCII input, in which case the
// unescaped version we wrote to output is complete.
if (!hasNonASCII)
return true;
// Save the pointer into the data was just converted (it may be appended to
// other data in the output buffer).
utf8Source = &output.data()[beginLength];
utf8SourceLength = output.length() - beginLength;
} else {
// We don't need to unescape, use input for IDNization later. (We know the
// input has non-ASCII, or the simple version would have been called
// instead of us.)
utf8Source = host;
utf8SourceLength = hostLength;
}
// Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
// Above, we may have used the output to write the unescaped values to, so
// we have to rewind it to where we started after we convert it to UTF-16.
StackBufferW utf16;
if (!ConvertUTF8ToUTF16(utf8Source, utf8SourceLength, utf16)) {
// In this error case, the input may or may not be the output.
StackBuffer utf8;
for (int i = 0; i < utf8SourceLength; i++)
utf8.append(utf8Source[i]);
output.setLength(beginLength);
AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
return false;
}
output.setLength(beginLength);
// This will call doSimpleHost which will do normal ASCII canonicalization
// and also check for IP addresses in the outpt.
return doIDNHost(utf16.data(), utf16.length(), output);
}
// UTF-16 convert host to its ASCII version. The set up is already ready for
// the backend, so we just pass through. The hasEscaped flag should be set if
// the input string requires unescaping.
bool doComplexHost(const UChar* host, int hostLength, bool hasNonASCII, bool hasEscaped, URLBuffer<char>& output)
{
if (hasEscaped) {
// Yikes, we have escaped characters with wide input. The escaped
// characters should be interpreted as UTF-8. To solve this problem,
// we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
//
// We don't bother to optimize the conversion in the ASCII case (which
// *could* just be a copy) and use the UTF-8 path, because it should be
// very rare that host names have escaped characters, and it is relatively
// fast to do the conversion anyway.
StackBuffer utf8;
if (!ConvertUTF16ToUTF8(host, hostLength, utf8)) {
AppendInvalidNarrowString(host, 0, hostLength, output);
return false;
}
// Once we convert to UTF-8, we can use the 8-bit version of the complex
// host handling code above.
return doComplexHost(utf8.data(), utf8.length(), hasNonASCII,
hasEscaped, output);
}
// No unescaping necessary, we can safely pass the input to ICU. This
// function will only get called if we either have escaped or non-ascii
// input, so it's safe to just use ICU now. Even if the input is ASCII,
// this function will do the right thing (just slower than we could).
return doIDNHost(host, hostLength, output);
}
template<typename CharacterType, typename UCHAR>
void doHost(const CharacterType* spec, const URLComponent& host, URLBuffer<char>& output, CanonHostInfo& hostInfo)
{
if (host.length() <= 0) {
// Empty hosts don't need anything.
hostInfo.family = CanonHostInfo::NEUTRAL;
hostInfo.ouputHost = URLComponent();
return;
}
bool hasNonASCII;
bool hasEscaped;
scanHostname<CharacterType, UCHAR>(spec, host, hasNonASCII, hasEscaped);
// Keep track of output's initial length, so we can rewind later.
const int outputBegin = output.length();
bool success;
if (!hasNonASCII && !hasEscaped) {
success = doSimpleHost(&spec[host.begin()], host.length(), output, hasNonASCII);
ASSERT(!hasNonASCII);
} else
success = doComplexHost(&spec[host.begin()], host.length(), hasNonASCII, hasEscaped, output);
if (!success) {
// Canonicalization failed. Set BROKEN to notify the caller.
hostInfo.family = CanonHostInfo::BROKEN;
} else {
// After all the other canonicalization, check if we ended up with an IP
// address. IP addresses are small, so writing into this temporary buffer
// should not cause an allocation.
RawURLBuffer<char, 64> canon_ip;
canonicalizeIPAddress(output.data(), URLComponent::fromRange(outputBegin, output.length()), canon_ip, hostInfo);
// If we got an IPv4/IPv6 address, copy the canonical form back to the
// real buffer. Otherwise, it's a hostname or broken IP, in which case
// we just leave it in place.
if (hostInfo.IsIPAddress()) {
output.setLength(outputBegin);
output.append(canon_ip.data(), canon_ip.length());
}
}
hostInfo.ouputHost = URLComponent::fromRange(outputBegin, output.length());
}
} // namespace
bool canonicalizeHost(const char* spec, const URLComponent& host, URLBuffer<char>& output, URLComponent& ouputHost)
{
CanonHostInfo hostInfo;
doHost<char, unsigned char>(spec, host, output, hostInfo);
ouputHost = hostInfo.ouputHost;
return (hostInfo.family != CanonHostInfo::BROKEN);
}
bool canonicalizeHost(const UChar* spec, const URLComponent& host, URLBuffer<char>& output, URLComponent& ouputHost)
{
CanonHostInfo hostInfo;
doHost<UChar, UChar>(spec, host, output, hostInfo);
ouputHost = hostInfo.ouputHost;
return (hostInfo.family != CanonHostInfo::BROKEN);
}
} // namespace URLCanonicalizer
} // namespace WTF
#endif // USE(WTFURL)