blob: 6f2aaf4e10f29ece6649d2353ca4b839f43be187 [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/strings/utf_offset_string_conversions.h"
#include <stdint.h>
#include <algorithm>
#include <memory>
#include <string_view>
#include "base/logging.h"
#include "base/strings/utf_string_conversion_utils.h"
namespace base {
OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
size_t original_length,
size_t output_length)
: original_offset(original_offset),
original_length(original_length),
output_length(output_length) {}
// static
void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
std::vector<size_t>* offsets_for_adjustment,
size_t limit) {
DCHECK(offsets_for_adjustment);
for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
i != offsets_for_adjustment->end(); ++i)
AdjustOffset(adjustments, &(*i), limit);
}
// static
void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
size_t* offset,
size_t limit) {
DCHECK(offset);
if (*offset == std::u16string::npos)
return;
int adjustment = 0;
for (Adjustments::const_iterator i = adjustments.begin();
i != adjustments.end(); ++i) {
if (*offset <= i->original_offset)
break;
if (*offset < (i->original_offset + i->original_length)) {
*offset = std::u16string::npos;
return;
}
adjustment += static_cast<int>(i->original_length - i->output_length);
}
*offset -= adjustment;
if (*offset > limit)
*offset = std::u16string::npos;
}
// static
void OffsetAdjuster::UnadjustOffsets(
const Adjustments& adjustments,
std::vector<size_t>* offsets_for_unadjustment) {
if (!offsets_for_unadjustment || adjustments.empty())
return;
for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
i != offsets_for_unadjustment->end(); ++i)
UnadjustOffset(adjustments, &(*i));
}
// static
void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
size_t* offset) {
if (*offset == std::u16string::npos)
return;
int adjustment = 0;
for (Adjustments::const_iterator i = adjustments.begin();
i != adjustments.end(); ++i) {
if (*offset + adjustment <= i->original_offset)
break;
adjustment += static_cast<int>(i->original_length - i->output_length);
if ((*offset + adjustment) < (i->original_offset + i->original_length)) {
*offset = std::u16string::npos;
return;
}
}
*offset += adjustment;
}
// static
void OffsetAdjuster::MergeSequentialAdjustments(
const Adjustments& first_adjustments,
Adjustments* adjustments_on_adjusted_string) {
Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
Adjustments::const_iterator first_iter = first_adjustments.begin();
// Simultaneously iterate over all |adjustments_on_adjusted_string| and
// |first_adjustments|, adding adjustments to or correcting the adjustments
// in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the
// current number of characters collapsed by |first_adjustments| up to this
// point. |currently_collapsing| keeps track of the number of characters
// collapsed by |first_adjustments| into the current |adjusted_iter|'s
// length. These are characters that will change |shift| as soon as we're
// done processing the current |adjusted_iter|; they are not yet reflected in
// |shift|.
size_t shift = 0;
size_t currently_collapsing = 0;
while (adjusted_iter != adjustments_on_adjusted_string->end()) {
if ((first_iter == first_adjustments.end()) ||
((adjusted_iter->original_offset + shift +
adjusted_iter->original_length) <= first_iter->original_offset)) {
// Entire |adjusted_iter| (accounting for its shift and including its
// whole original length) comes before |first_iter|.
//
// Correct the offset at |adjusted_iter| and move onto the next
// adjustment that needs revising.
adjusted_iter->original_offset += shift;
shift += currently_collapsing;
currently_collapsing = 0;
++adjusted_iter;
} else if ((adjusted_iter->original_offset + shift) >
first_iter->original_offset) {
// |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
// It's not possible for the adjustments to overlap. (It shouldn't
// be possible that we have an |adjusted_iter->original_offset| that,
// when adjusted by the computed |shift|, is in the middle of
// |first_iter|'s output's length. After all, that would mean the
// current adjustment_on_adjusted_string somehow points to an offset
// that was supposed to have been eliminated by the first set of
// adjustments.)
DCHECK_LE(first_iter->original_offset + first_iter->output_length,
adjusted_iter->original_offset + shift);
// Add the |first_adjustment_iter| to the full set of adjustments while
// making sure |adjusted_iter| continues pointing to the same element.
// We do this by inserting the |first_adjustment_iter| right before
// |adjusted_iter|, then incrementing |adjusted_iter| so it points to
// the following element.
shift += first_iter->original_length - first_iter->output_length;
adjusted_iter =
adjustments_on_adjusted_string->insert(adjusted_iter, *first_iter);
++adjusted_iter;
++first_iter;
} else {
// The first adjustment adjusted something that then got further adjusted
// by the second set of adjustments. In other words, |first_iter| points
// to something in the range covered by |adjusted_iter|'s length (after
// accounting for |shift|). Precisely,
// adjusted_iter->original_offset + shift
// <=
// first_iter->original_offset
// <=
// adjusted_iter->original_offset + shift +
// adjusted_iter->original_length
// Modify the current |adjusted_iter| to include whatever collapsing
// happened in |first_iter|, then advance to the next |first_adjustments|
// because we dealt with the current one.
const int collapse = static_cast<int>(first_iter->original_length) -
static_cast<int>(first_iter->output_length);
// This function does not know how to deal with a string that expands and
// then gets modified, only strings that collapse and then get modified.
DCHECK_GT(collapse, 0);
adjusted_iter->original_length += collapse;
currently_collapsing += collapse;
++first_iter;
}
}
DCHECK_EQ(0u, currently_collapsing);
if (first_iter != first_adjustments.end()) {
// Only first adjustments are left. These do not need to be modified.
// (Their offsets are already correct with respect to the original string.)
// Append them all.
DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
adjustments_on_adjusted_string->insert(
adjustments_on_adjusted_string->end(), first_iter,
first_adjustments.end());
}
}
// Converts the given source Unicode character type to the given destination
// Unicode character type as a STL string. The given input buffer and size
// determine the source, and the given output STL string will be replaced by
// the result. If non-NULL, |adjustments| is set to reflect the all the
// alterations to the string that are not one-character-to-one-character.
// It will always be sorted by increasing offset.
template <typename SrcChar, typename DestStdString>
bool ConvertUnicode(const SrcChar* src,
size_t src_len,
DestStdString* output,
OffsetAdjuster::Adjustments* adjustments) {
if (adjustments)
adjustments->clear();
// ICU requires 32-bit numbers.
bool success = true;
int32_t src_len32 = static_cast<int32_t>(src_len);
for (int32_t i = 0; i < src_len32; i++) {
uint32_t code_point;
size_t original_i = i;
size_t chars_written = 0;
if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
chars_written = WriteUnicodeCharacter(code_point, output);
} else {
chars_written = WriteUnicodeCharacter(0xFFFD, output);
success = false;
}
// Only bother writing an adjustment if this modification changed the
// length of this character.
// NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
// character read, not after it (so that incrementing it in the loop
// increment will place it at the right location), so we need to account
// for that in determining the amount that was read.
if (adjustments && ((i - original_i + 1) != chars_written)) {
adjustments->push_back(OffsetAdjuster::Adjustment(
original_i, i - original_i + 1, chars_written));
}
}
return success;
}
bool UTF8ToUTF16WithAdjustments(
const char* src,
size_t src_len,
std::u16string* output,
base::OffsetAdjuster::Adjustments* adjustments) {
PrepareForUTF16Or32Output(src, src_len, output);
return ConvertUnicode(src, src_len, output, adjustments);
}
std::u16string UTF8ToUTF16WithAdjustments(
std::string_view utf8,
base::OffsetAdjuster::Adjustments* adjustments) {
std::u16string result;
UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
return result;
}
std::u16string UTF8ToUTF16AndAdjustOffsets(
std::string_view utf8,
std::vector<size_t>* offsets_for_adjustment) {
for (size_t& offset : *offsets_for_adjustment) {
if (offset > utf8.length())
offset = std::u16string::npos;
}
OffsetAdjuster::Adjustments adjustments;
std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
return result;
}
std::string UTF16ToUTF8AndAdjustOffsets(
std::u16string_view utf16,
std::vector<size_t>* offsets_for_adjustment) {
for (size_t& offset : *offsets_for_adjustment) {
if (offset > utf16.length())
offset = std::u16string::npos;
}
std::string result;
PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
OffsetAdjuster::Adjustments adjustments;
ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
return result;
}
} // namespace base