blob: e61fa707aad573d1c614efe166b0109d7881833a [file] [log] [blame]
/*
* Copyright 2011 Google Inc. All rights reserved.
* Copyright 2012 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// ICU integration functions.
#include "config.h"
#if USE(WTFURL)
#include "URLCanonInternal.h" // for _itoa_s
#include <stdlib.h>
#include <string.h>
#include <unicode/ucnv.h>
#include <unicode/ucnv_cb.h>
#include <unicode/uidna.h>
namespace WTF {
namespace URLCanonicalizer {
namespace {
// ICU from-Unicode callback used for code points that the target encoding
// cannot represent. The code point is written as a decimal numeric character
// reference ("&#1234;") whose non-digit characters are percent-escaped, giving
// "%26%231234%3B". Why? This is what Netscape did back in the olden days.
//
// Only UCNV_UNASSIGNED is handled; for every other reason the function returns
// without touching |err| or writing any bytes.
void appendURLEscapedChar(const void* /* context */,
    UConverterFromUnicodeArgs* fromArgs,
    const UChar* /* code_units */,
    int32_t /* length */,
    UChar32 codePoint,
    UConverterCallbackReason reason,
    UErrorCode* err)
{
    if (reason != UCNV_UNASSIGNED)
        return;

    static const char escapedPrefix[] = "%26%23"; // "&#" percent-escaped
    static const char escapedSuffix[] = "%3B"; // ";" percent-escaped

    // Reset the error before writing the replacement bytes.
    *err = U_ZERO_ERROR;

    // sizeof includes the terminating NUL, which must not be written.
    ucnv_cbFromUWriteBytes(fromArgs, escapedPrefix, static_cast<int>(sizeof(escapedPrefix) - 1), 0, err);

    ASSERT(codePoint < 0x110000);
    char digits[8]; // The largest code point has 7 decimal digits, plus the terminator.
    _itoa_s(codePoint, digits, 10);
    ucnv_cbFromUWriteBytes(fromArgs, digits, static_cast<int>(strlen(digits)), 0, err);

    ucnv_cbFromUWriteBytes(fromArgs, escapedSuffix, static_cast<int>(sizeof(escapedSuffix) - 1), 0, err);
}
// Scoped installer for the invalid-character callback: the constructor sets
// appendURLEscapedChar as the converter's from-Unicode callback and remembers
// the callback/context pair that was there before; the destructor puts that
// pair back.
class AppendHandlerInstaller {
public:
    // The owner of this object must ensure that the converter is alive for the
    // duration of this object's lifetime.
    explicit AppendHandlerInstaller(UConverter* converter)
        : m_converter(converter)
    {
        UErrorCode err = U_ZERO_ERROR;
        ucnv_setFromUCallBack(m_converter, appendURLEscapedChar, 0, &m_oldCallback, &m_oldContext, &err);
    }

    ~AppendHandlerInstaller()
    {
        // The previous values are not needed here, so the "old" out-parameters
        // are passed as null.
        UErrorCode err = U_ZERO_ERROR;
        ucnv_setFromUCallBack(m_converter, m_oldCallback, m_oldContext, 0, 0, &err);
    }

private:
    // Not copyable: a copy would restore the saved callback a second time,
    // possibly while the other instance still expects its callback to be
    // installed. Declared but intentionally left undefined.
    AppendHandlerInstaller(const AppendHandlerInstaller&);
    AppendHandlerInstaller& operator=(const AppendHandlerInstaller&);

    UConverter* m_converter; // Not owned.
    UConverterFromUCallback m_oldCallback; // Callback in place before installation.
    const void* m_oldContext; // Context that belonged to m_oldCallback.
};
} // namespace
// Converts the Unicode input representing a hostname to ASCII using IDN rules.
// The output must be ASCII, but is represented as wide characters.
//
// On success, the output will be filled with the ASCII host name and it will
// return true. Unlike most other canonicalization functions, this assumes that
// the output is empty. The beginning of the host will be at offset 0, and
// the length of the output will be set to the length of the new host name.
//
// On error, this will return false. The output in this case is undefined.
bool IDNToASCII(const UChar* src, int sourceLength, URLBuffer<UChar>& output)
{
ASSERT(!output.length()); // Output buffer is assumed empty.
while (true) {
// Use ALLOW_UNASSIGNED to be more tolerant of hostnames that violate
// the spec (which do exist). This does not present any risk and is a
// little more future proof.
UErrorCode err = U_ZERO_ERROR;
int numConverted = uidna_IDNToASCII(src, sourceLength, output.data(),
output.capacity(),
UIDNA_ALLOW_UNASSIGNED, 0, &err);
if (err == U_ZERO_ERROR) {
output.setLength(numConverted);
return true;
}
if (err != U_BUFFER_OVERFLOW_ERROR)
return false; // Unknown error, give up.
// Not enough room in our buffer, expand.
output.resize(output.capacity() * 2);
}
}
// Decodes one UTF-8 sequence from |str|, starting at index |*begin| and not
// reading past |length|. On return |*begin| indexes the last byte consumed
// (not the byte after it).
//
// Returns true and stores the decoded value in |*codePointOut| when it is a
// Unicode character. Otherwise stores kUnicodeReplacementCharacter and returns
// false.
bool readUTFChar(const char* str, int* begin, int length, unsigned* codePointOut)
{
    int decoded; // Signed so that U8_NEXT can write -1 to it without a warning.
    U8_NEXT(str, *begin, length, decoded);

    // U8_NEXT advances past the sequence; step back onto its final byte.
    --(*begin);

    // Validate the decoded value.
    if (!U_IS_UNICODE_CHAR(decoded)) {
        *codePointOut = kUnicodeReplacementCharacter;
        return false;
    }

    *codePointOut = static_cast<unsigned>(decoded);
    return true;
}
bool readUTFChar(const UChar* str, int* begin, int length, unsigned* codePoint)
{
if (U16_IS_SURROGATE(str[*begin])) {
if (!U16_IS_SURROGATE_LEAD(str[*begin]) || *begin + 1 >= length || !U16_IS_TRAIL(str[*begin + 1])) {
// Invalid surrogate pair.
*codePoint = kUnicodeReplacementCharacter;
return false;
}
// Valid surrogate pair.
*codePoint = U16_GET_SUPPLEMENTARY(str[*begin], str[*begin + 1]);
(*begin)++;
} else {
// Not a surrogate, just one 16-bit word.
*codePoint = str[*begin];
}
if (U_IS_UNICODE_CHAR(*codePoint))
return true;
// Invalid code point.
*codePoint = kUnicodeReplacementCharacter;
return false;
}
} // namespace URLCanonicalizer
} // namespace WTF
#endif // USE(WTFURL)