blob: b6abd568d5a24996798b82e4d297e3c46dace01d [file] [log] [blame]
// Copyright 2021 The Cobalt Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cctype>
#include "cobalt/encoding/text_decoder.h"
#include "third_party/icu/source/common/unicode/uchriter.h"
#include "third_party/icu/source/common/unicode/ucnv.h"
#include "third_party/icu/source/common/unicode/unistr.h"
#include "third_party/icu/source/common/unicode/utypes.h"
namespace cobalt {
namespace encoding {
// static
const std::size_t TextDecoder::kConversionBufferSize = 16384; // 2^14
const TextDecoderOptions TextDecoder::kDefaultDecoderOptions;
const char TextDecoder::kDefaultEncoding[] = "utf-8";
const UChar32 TextDecoder::kReplacementCharacter = 0xFFFD;
const char TextDecoder::kReplacementEncoding[] = "REPLACEMENT";
namespace {
std::string ToLower(std::string str) {
std::transform(str.begin(), str.end(), str.begin(),
[](unsigned char c) { return std::tolower(c); });
return str;
}
} // namespace
void TextDecoder::Cleanup() {
encoding_.clear();
if (converter_) {
ucnv_close(converter_);
}
converter_ = nullptr;
}
void TextDecoder::Setup(std::string label,
script::ExceptionState* exception_state,
const TextDecoderOptions& options) {
UErrorCode error_code = U_ZERO_ERROR;
converter_ = ucnv_open(label.c_str(), &error_code);
if (U_FAILURE(error_code)) {
LOG(ERROR) << "Unable to open converter. " << u_errorName(error_code);
exception_state->SetSimpleException(script::kRangeError,
"label %s is invalid", label.c_str());
return Cleanup();
}
error_code = U_ZERO_ERROR;
encoding_ = ucnv_getName(converter_, &error_code);
if (U_FAILURE(error_code) || encoding_ == kReplacementEncoding) {
LOG(ERROR) << "Unable to get encoding name. " << u_errorName(error_code);
exception_state->SetSimpleException(
script::kRangeError, "encoding %s is invalid", encoding_.c_str());
return Cleanup();
}
// Internal default setup;
encoding_ = ToLower(encoding_);
error_mode_ = options.fatal() ? "fatal" : "replacement";
ignore_bom_ = options.ignore_bom();
bom_seen_ = false;
do_not_flush_ = false;
}
TextDecoder::TextDecoder(script::ExceptionState* exception_state)
: converter_(nullptr) {
Setup(kDefaultEncoding, exception_state, kDefaultDecoderOptions);
}
TextDecoder::TextDecoder(const std::string& label,
script::ExceptionState* exception_state)
: converter_(nullptr) {
Setup(label, exception_state, kDefaultDecoderOptions);
}
TextDecoder::TextDecoder(const TextDecoderOptions& options,
script::ExceptionState* exception_state) {
Setup(kDefaultEncoding, exception_state, options);
}
TextDecoder::TextDecoder(const std::string& label,
const TextDecoderOptions& options,
script::ExceptionState* exception_state) {
Setup(label, exception_state, options);
}
TextDecoder::~TextDecoder() { Cleanup(); }
std::string TextDecoder::Decode(script::ExceptionState* exception_state) {
const TextDecodeOptions default_options;
return Decode(default_options, exception_state);
}
std::string TextDecoder::Decode(const dom::BufferSource& input,
script::ExceptionState* exception_state) {
const TextDecodeOptions default_options;
return Decode(input, default_options, exception_state);
}
std::string TextDecoder::Decode(const TextDecodeOptions& options,
script::ExceptionState* exception_state) {
std::string result;
Decode(nullptr, 0, options, exception_state, &result);
return result;
}
std::string TextDecoder::Decode(const dom::BufferSource& input,
const TextDecodeOptions& options,
script::ExceptionState* exception_state) {
int32_t size;
const uint8* buffer;
std::string result;
dom::GetBufferAndSize(input, &buffer, &size);
Decode(reinterpret_cast<const char*>(buffer), size, options, exception_state,
&result);
return result;
}
// General Decode method, all public methods end here.
void TextDecoder::Decode(const char* start, int32_t length,
const TextDecodeOptions& options,
script::ExceptionState* exception_state,
std::string* output) {
if (!converter_) {
return;
}
if (!do_not_flush_) {
bom_seen_ = false;
}
if (!ignore_bom_ && !bom_seen_) {
bom_seen_ = true;
if (!RemoveBOM(start, length, exception_state)) {
return;
}
}
do_not_flush_ = options.stream();
bool saw_error = false;
DecodeImpl(start, length, !do_not_flush_, error_mode_ == "fatal", saw_error,
output);
if (error_mode_ == "fatal" && saw_error) {
if (!do_not_flush_) {
// If flushing, the error should not persist.
ucnv_reset(converter_);
}
exception_state->SetSimpleException(script::kTypeError,
"Error processing the encoded data");
}
return;
}
bool TextDecoder::RemoveBOM(const char*& buffer, int& size,
script::ExceptionState* exception_state) {
UErrorCode error_code = U_ZERO_ERROR;
int32_t signature_length = 0;
const char* detected_encoding =
ucnv_detectUnicodeSignature(buffer, size, &signature_length, &error_code);
if (U_FAILURE(error_code)) {
LOG(ERROR) << "Error detecting signature " << u_errorName(error_code);
exception_state->SetSimpleException(script::kRangeError,
"Error detecting signature");
return false;
}
// Only remove BOM when both encodings match.
if (detected_encoding && (ToLower(detected_encoding) == encoding_)) {
buffer += signature_length;
size -= signature_length;
LOG(INFO) << "Buffer adjusted (+" << signature_length << ") bytes.";
}
return true;
}
class ErrorCallbackSetter final {
public:
ErrorCallbackSetter(UConverter* converter, bool stop_on_error)
: converter_(converter), stop_on_error_(stop_on_error) {
if (stop_on_error_) {
UErrorCode error = U_ZERO_ERROR;
ucnv_setToUCallBack(converter_, UCNV_TO_U_CALLBACK_STOP, nullptr,
&saved_action_, &saved_context_, &error);
DCHECK_EQ(error, U_ZERO_ERROR);
}
}
~ErrorCallbackSetter() {
if (stop_on_error_) {
UErrorCode error = U_ZERO_ERROR;
const void* old_context;
UConverterToUCallback old_action;
ucnv_setToUCallBack(converter_, saved_action_, saved_context_,
&old_action, &old_context, &error);
DCHECK_EQ(old_action, UCNV_TO_U_CALLBACK_STOP);
DCHECK(!old_context);
DCHECK_EQ(error, U_ZERO_ERROR);
}
}
private:
UConverter* converter_;
bool stop_on_error_;
const void* saved_context_;
UConverterToUCallback saved_action_;
};
namespace {
int DecodeTo(UConverter* converter_, UChar* target, UChar* target_limit,
const char*& source, const char* source_limit, int32_t* offsets,
bool flush, UErrorCode& error) {
UChar* target_start = target;
error = U_ZERO_ERROR;
ucnv_toUnicode(converter_, &target, target_limit, &source, source_limit,
offsets, flush, &error);
return static_cast<int>(target - target_start);
}
} // namespace
void TextDecoder::DecodeImpl(const char* bytes, int length, bool flush,
bool stop_on_error, bool& saw_error,
std::string* output) {
if (!converter_) {
LOG(WARNING) << "no converter available.";
return;
}
ErrorCallbackSetter callback_setter(converter_, stop_on_error);
icu::UnicodeString result;
UChar buffer[kConversionBufferSize];
UChar* buffer_limit = buffer + kConversionBufferSize;
const char* source = reinterpret_cast<const char*>(bytes);
const char* source_limit = source + length;
int32_t* offsets = nullptr;
UErrorCode error = U_ZERO_ERROR;
do {
int uchars_decoded = DecodeTo(converter_, buffer, buffer_limit, source,
source_limit, offsets, flush, error);
result.append(buffer, uchars_decoded);
} while (error == U_BUFFER_OVERFLOW_ERROR);
// Flush the converter so it can be reused.
if (U_FAILURE(error)) {
do {
DecodeTo(converter_, buffer, buffer_limit, source, source_limit, offsets,
true, error);
} while (source < source_limit);
saw_error = true;
}
result.toUTF8String(*output);
return;
}
} // namespace encoding
} // namespace cobalt