// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
[email protected] | a3f7218 | 2013-02-07 03:59:06 | [diff] [blame] | 5 | #include "base/strings/utf_offset_string_conversions.h" |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 6 | |
[email protected] | 421de2a | 2011-04-13 18:43:05 | [diff] [blame] | 7 | #include <algorithm> |
| 8 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 9 | #include "base/logging.h" |
[email protected] | e57a716 | 2011-06-15 04:14:23 | [diff] [blame] | 10 | #include "base/memory/scoped_ptr.h" |
[email protected] | eb62f72 | 2013-03-30 14:29:00 | [diff] [blame] | 11 | #include "base/strings/string_piece.h" |
[email protected] | a3f7218 | 2013-02-07 03:59:06 | [diff] [blame] | 12 | #include "base/strings/utf_string_conversion_utils.h" |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 13 | |
[email protected] | a3f7218 | 2013-02-07 03:59:06 | [diff] [blame] | 14 | namespace base { |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 15 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 16 | OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| 17 | size_t original_length, |
| 18 | size_t output_length) |
| 19 | : original_offset(original_offset), |
| 20 | original_length(original_length), |
| 21 | output_length(output_length) { |
| 22 | } |
| 23 | |
| 24 | // static |
| 25 | void OffsetAdjuster::AdjustOffsets( |
| 26 | const Adjustments& adjustments, |
| 27 | std::vector<size_t>* offsets_for_adjustment) { |
| 28 | if (!offsets_for_adjustment || adjustments.empty()) |
| 29 | return; |
| 30 | for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); |
| 31 | i != offsets_for_adjustment->end(); ++i) |
| 32 | AdjustOffset(adjustments, &(*i)); |
| 33 | } |
| 34 | |
| 35 | // static |
| 36 | void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, |
| 37 | size_t* offset) { |
| 38 | if (*offset == string16::npos) |
| 39 | return; |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 40 | int adjustment = 0; |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 41 | for (Adjustments::const_iterator i = adjustments.begin(); |
| 42 | i != adjustments.end(); ++i) { |
| 43 | if (*offset <= i->original_offset) |
| 44 | break; |
| 45 | if (*offset < (i->original_offset + i->original_length)) { |
| 46 | *offset = string16::npos; |
| 47 | return; |
| 48 | } |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 49 | adjustment += static_cast<int>(i->original_length - i->output_length); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 50 | } |
| 51 | *offset -= adjustment; |
| 52 | } |
| 53 | |
| 54 | // static |
[email protected] | 529d4b5 | 2014-04-28 19:37:23 | [diff] [blame] | 55 | void OffsetAdjuster::UnadjustOffsets( |
| 56 | const Adjustments& adjustments, |
| 57 | std::vector<size_t>* offsets_for_unadjustment) { |
| 58 | if (!offsets_for_unadjustment || adjustments.empty()) |
| 59 | return; |
| 60 | for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); |
| 61 | i != offsets_for_unadjustment->end(); ++i) |
| 62 | UnadjustOffset(adjustments, &(*i)); |
| 63 | } |
| 64 | |
| 65 | // static |
| 66 | void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, |
| 67 | size_t* offset) { |
| 68 | if (*offset == string16::npos) |
| 69 | return; |
| 70 | int adjustment = 0; |
| 71 | for (Adjustments::const_iterator i = adjustments.begin(); |
| 72 | i != adjustments.end(); ++i) { |
| 73 | if (*offset + adjustment <= i->original_offset) |
| 74 | break; |
| 75 | adjustment += static_cast<int>(i->original_length - i->output_length); |
| 76 | if ((*offset + adjustment) < |
| 77 | (i->original_offset + i->original_length)) { |
| 78 | *offset = string16::npos; |
| 79 | return; |
| 80 | } |
| 81 | } |
| 82 | *offset += adjustment; |
| 83 | } |
| 84 | |
| 85 | // static |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 86 | void OffsetAdjuster::MergeSequentialAdjustments( |
| 87 | const Adjustments& first_adjustments, |
| 88 | Adjustments* adjustments_on_adjusted_string) { |
| 89 | Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); |
| 90 | Adjustments::const_iterator first_iter = first_adjustments.begin(); |
| 91 | // Simultaneously iterate over all |adjustments_on_adjusted_string| and |
| 92 | // |first_adjustments|, adding adjustments to or correcting the adjustments |
| 93 | // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the |
| 94 | // current number of characters collapsed by |first_adjustments| up to this |
| 95 | // point. |currently_collapsing| keeps track of the number of characters |
| 96 | // collapsed by |first_adjustments| into the current |adjusted_iter|'s |
| 97 | // length. These are characters that will change |shift| as soon as we're |
| 98 | // done processing the current |adjusted_iter|; they are not yet reflected in |
| 99 | // |shift|. |
| 100 | size_t shift = 0; |
| 101 | size_t currently_collapsing = 0; |
| 102 | while (adjusted_iter != adjustments_on_adjusted_string->end()) { |
| 103 | if ((first_iter == first_adjustments.end()) || |
| 104 | ((adjusted_iter->original_offset + shift + |
| 105 | adjusted_iter->original_length) <= first_iter->original_offset)) { |
| 106 | // Entire |adjusted_iter| (accounting for its shift and including its |
| 107 | // whole original length) comes before |first_iter|. |
| 108 | // |
| 109 | // Correct the offset at |adjusted_iter| and move onto the next |
| 110 | // adjustment that needs revising. |
| 111 | adjusted_iter->original_offset += shift; |
| 112 | shift += currently_collapsing; |
| 113 | currently_collapsing = 0; |
| 114 | ++adjusted_iter; |
| 115 | } else if ((adjusted_iter->original_offset + shift) > |
| 116 | first_iter->original_offset) { |
| 117 | // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). |
| 118 | |
| 119 | // It's not possible for the adjustments to overlap. (It shouldn't |
| 120 | // be possible that we have an |adjusted_iter->original_offset| that, |
| 121 | // when adjusted by the computed |shift|, is in the middle of |
[email protected] | e281976 | 2014-04-23 04:28:38 | [diff] [blame] | 122 | // |first_iter|'s output's length. After all, that would mean the |
| 123 | // current adjustment_on_adjusted_string somehow points to an offset |
| 124 | // that was supposed to have been eliminated by the first set of |
| 125 | // adjustments.) |
| 126 | DCHECK_LE(first_iter->original_offset + first_iter->output_length, |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 127 | adjusted_iter->original_offset + shift); |
| 128 | |
| 129 | // Add the |first_adjustment_iter| to the full set of adjustments while |
| 130 | // making sure |adjusted_iter| continues pointing to the same element. |
| 131 | // We do this by inserting the |first_adjustment_iter| right before |
| 132 | // |adjusted_iter|, then incrementing |adjusted_iter| so it points to |
| 133 | // the following element. |
| 134 | shift += first_iter->original_length - first_iter->output_length; |
| 135 | adjusted_iter = adjustments_on_adjusted_string->insert( |
| 136 | adjusted_iter, *first_iter); |
| 137 | ++adjusted_iter; |
| 138 | ++first_iter; |
| 139 | } else { |
| 140 | // The first adjustment adjusted something that then got further adjusted |
| 141 | // by the second set of adjustments. In other words, |first_iter| points |
| 142 | // to something in the range covered by |adjusted_iter|'s length (after |
| 143 | // accounting for |shift|). Precisely, |
| 144 | // adjusted_iter->original_offset + shift |
| 145 | // <= |
| 146 | // first_iter->original_offset |
| 147 | // <= |
| 148 | // adjusted_iter->original_offset + shift + |
| 149 | // adjusted_iter->original_length |
| 150 | |
| 151 | // Modify the current |adjusted_iter| to include whatever collapsing |
| 152 | // happened in |first_iter|, then advance to the next |first_adjustments| |
| 153 | // because we dealt with the current one. |
| 154 | const int collapse = static_cast<int>(first_iter->original_length) - |
| 155 | static_cast<int>(first_iter->output_length); |
| 156 | // This function does not know how to deal with a string that expands and |
| 157 | // then gets modified, only strings that collapse and then get modified. |
| 158 | DCHECK_GT(collapse, 0); |
| 159 | adjusted_iter->original_length += collapse; |
| 160 | currently_collapsing += collapse; |
| 161 | ++first_iter; |
| 162 | } |
| 163 | } |
| 164 | DCHECK_EQ(0u, currently_collapsing); |
| 165 | if (first_iter != first_adjustments.end()) { |
| 166 | // Only first adjustments are left. These do not need to be modified. |
| 167 | // (Their offsets are already correct with respect to the original string.) |
| 168 | // Append them all. |
| 169 | DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); |
| 170 | adjustments_on_adjusted_string->insert( |
| 171 | adjustments_on_adjusted_string->end(), first_iter, |
| 172 | first_adjustments.end()); |
| 173 | } |
| 174 | } |
| 175 | |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 176 | // Converts the given source Unicode character type to the given destination |
| 177 | // Unicode character type as a STL string. The given input buffer and size |
| 178 | // determine the source, and the given output STL string will be replaced by |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 179 | // the result. If non-NULL, |adjustments| is set to reflect the all the |
| 180 | // alterations to the string that are not one-character-to-one-character. |
| 181 | // It will always be sorted by increasing offset. |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 182 | template<typename SrcChar, typename DestStdString> |
| 183 | bool ConvertUnicode(const SrcChar* src, |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 184 | size_t src_len, |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 185 | DestStdString* output, |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 186 | OffsetAdjuster::Adjustments* adjustments) { |
| 187 | if (adjustments) |
| 188 | adjustments->clear(); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 189 | // ICU requires 32-bit numbers. |
| 190 | bool success = true; |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 191 | int32 src_len32 = static_cast<int32>(src_len); |
| 192 | for (int32 i = 0; i < src_len32; i++) { |
| 193 | uint32 code_point; |
| 194 | size_t original_i = i; |
| 195 | size_t chars_written = 0; |
| 196 | if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| 197 | chars_written = WriteUnicodeCharacter(code_point, output); |
| 198 | } else { |
[email protected] | d7a3e8e | 2010-01-01 22:16:38 | [diff] [blame] | 199 | chars_written = WriteUnicodeCharacter(0xFFFD, output); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 200 | success = false; |
| 201 | } |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 202 | |
| 203 | // Only bother writing an adjustment if this modification changed the |
| 204 | // length of this character. |
| 205 | // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
| 206 | // character read, not after it (so that incrementing it in the loop |
| 207 | // increment will place it at the right location), so we need to account |
| 208 | // for that in determining the amount that was read. |
| 209 | if (adjustments && ((i - original_i + 1) != chars_written)) { |
| 210 | adjustments->push_back(OffsetAdjuster::Adjustment( |
| 211 | original_i, i - original_i + 1, chars_written)); |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 212 | } |
| 213 | } |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 214 | return success; |
| 215 | } |
| 216 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 217 | bool UTF8ToUTF16WithAdjustments( |
| 218 | const char* src, |
| 219 | size_t src_len, |
| 220 | string16* output, |
| 221 | base::OffsetAdjuster::Adjustments* adjustments) { |
[email protected] | b9f9383 | 2009-11-13 19:27:48 | [diff] [blame] | 222 | PrepareForUTF16Or32Output(src, src_len, output); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 223 | return ConvertUnicode(src, src_len, output, adjustments); |
[email protected] | 421de2a | 2011-04-13 18:43:05 | [diff] [blame] | 224 | } |
| 225 | |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 226 | string16 UTF8ToUTF16WithAdjustments( |
| 227 | const base::StringPiece& utf8, |
| 228 | base::OffsetAdjuster::Adjustments* adjustments) { |
[email protected] | 04866c4 | 2011-05-03 20:03:50 | [diff] [blame] | 229 | string16 result; |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 230 | UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); |
[email protected] | 421de2a | 2011-04-13 18:43:05 | [diff] [blame] | 231 | return result; |
| 232 | } |
| 233 | |
[email protected] | 04866c4 | 2011-05-03 20:03:50 | [diff] [blame] | 234 | string16 UTF8ToUTF16AndAdjustOffsets( |
| 235 | const base::StringPiece& utf8, |
[email protected] | 421de2a | 2011-04-13 18:43:05 | [diff] [blame] | 236 | std::vector<size_t>* offsets_for_adjustment) { |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 237 | std::for_each(offsets_for_adjustment->begin(), |
| 238 | offsets_for_adjustment->end(), |
| 239 | LimitOffset<base::StringPiece>(utf8.length())); |
| 240 | OffsetAdjuster::Adjustments adjustments; |
| 241 | string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); |
| 242 | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 243 | return result; |
| 244 | } |
| 245 | |
| 246 | std::string UTF16ToUTF8AndAdjustOffsets( |
| 247 | const base::StringPiece16& utf16, |
| 248 | std::vector<size_t>* offsets_for_adjustment) { |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 249 | std::for_each(offsets_for_adjustment->begin(), |
| 250 | offsets_for_adjustment->end(), |
| 251 | LimitOffset<base::StringPiece16>(utf16.length())); |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 252 | std::string result; |
| 253 | PrepareForUTF8Output(utf16.data(), utf16.length(), &result); |
[email protected] | a97376e | 2014-04-18 20:54:44 | [diff] [blame] | 254 | OffsetAdjuster::Adjustments adjustments; |
| 255 | ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); |
| 256 | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
[email protected] | cbf35e17 | 2011-09-08 02:18:10 | [diff] [blame] | 257 | return result; |
| 258 | } |
| 259 | |
[email protected] | a3f7218 | 2013-02-07 03:59:06 | [diff] [blame] | 260 | } // namespace base |