[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 1 | // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
avi | c0c6031 | 2015-12-21 21:03:50 | [diff] [blame] | 5 | #include <stddef.h> |
| 6 | |
Avi Drissman | a92b3be | 2018-12-24 21:55:29 | [diff] [blame] | 7 | #include "base/stl_util.h" |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 8 | #include "testing/gtest/include/gtest/gtest.h" |
| 9 | #include "third_party/icu/source/common/unicode/ucnv.h" |
| 10 | #include "url/url_canon.h" |
| 11 | #include "url/url_canon_icu.h" |
| 12 | #include "url/url_canon_stdstring.h" |
| 13 | #include "url/url_test_utils.h" |
| 14 | |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 15 | namespace url { |
| 16 | |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 17 | namespace { |
| 18 | |
| 19 | // Wrapper around a UConverter object that managers creation and destruction. |
| 20 | class UConvScoper { |
| 21 | public: |
| 22 | explicit UConvScoper(const char* charset_name) { |
| 23 | UErrorCode err = U_ZERO_ERROR; |
| 24 | converter_ = ucnv_open(charset_name, &err); |
| 25 | } |
| 26 | |
| 27 | ~UConvScoper() { |
| 28 | if (converter_) |
| 29 | ucnv_close(converter_); |
| 30 | } |
| 31 | |
| 32 | // Returns the converter object, may be NULL. |
| 33 | UConverter* converter() const { return converter_; } |
| 34 | |
| 35 | private: |
| 36 | UConverter* converter_; |
| 37 | }; |
| 38 | |
| 39 | TEST(URLCanonIcuTest, ICUCharsetConverter) { |
| 40 | struct ICUCase { |
| 41 | const wchar_t* input; |
| 42 | const char* encoding; |
| 43 | const char* expected; |
| 44 | } icu_cases[] = { |
| 45 | // UTF-8. |
| 46 | {L"Hello, world", "utf-8", "Hello, world"}, |
| 47 | {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"}, |
| 48 | // Non-BMP UTF-8. |
| 49 | {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"}, |
| 50 | // Big5 |
| 51 | {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"}, |
| 52 | // Unrepresentable character in the destination set. |
| 53 | {L"hello\x4f60\x06de\x597dworld", "big5", |
| 54 | "hello\xa7\x41%26%231758%3B\xa6\x6eworld"}, |
| 55 | }; |
| 56 | |
Avi Drissman | a92b3be | 2018-12-24 21:55:29 | [diff] [blame] | 57 | for (size_t i = 0; i < base::size(icu_cases); i++) { |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 58 | UConvScoper conv(icu_cases[i].encoding); |
| 59 | ASSERT_TRUE(conv.converter() != NULL); |
| 60 | ICUCharsetConverter converter(conv.converter()); |
| 61 | |
| 62 | std::string str; |
| 63 | StdStringCanonOutput output(&str); |
| 64 | |
brettw | 1b8582f | 2016-11-03 20:37:17 | [diff] [blame] | 65 | base::string16 input_str( |
| 66 | test_utils::TruncateWStringToUTF16(icu_cases[i].input)); |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 67 | int input_len = static_cast<int>(input_str.length()); |
| 68 | converter.ConvertFromUTF16(input_str.c_str(), input_len, &output); |
| 69 | output.Complete(); |
| 70 | |
| 71 | EXPECT_STREQ(icu_cases[i].expected, str.c_str()); |
| 72 | } |
| 73 | |
| 74 | // Test string sizes around the resize boundary for the output to make sure |
| 75 | // the converter resizes as needed. |
| 76 | const int static_size = 16; |
| 77 | UConvScoper conv("utf-8"); |
| 78 | ASSERT_TRUE(conv.converter()); |
| 79 | ICUCharsetConverter converter(conv.converter()); |
| 80 | for (int i = static_size - 2; i <= static_size + 2; i++) { |
| 81 | // Make a string with the appropriate length. |
| 82 | base::string16 input; |
| 83 | for (int ch = 0; ch < i; ch++) |
| 84 | input.push_back('a'); |
| 85 | |
| 86 | RawCanonOutput<static_size> output; |
| 87 | converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()), |
| 88 | &output); |
| 89 | EXPECT_EQ(input.length(), static_cast<size_t>(output.length())); |
| 90 | } |
| 91 | } |
| 92 | |
| 93 | TEST(URLCanonIcuTest, QueryWithConverter) { |
| 94 | struct QueryCase { |
| 95 | const char* input8; |
| 96 | const wchar_t* input16; |
| 97 | const char* encoding; |
| 98 | const char* expected; |
| 99 | } query_cases[] = { |
| 100 | // Regular ASCII case in some different encodings. |
| 101 | {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"}, |
| 102 | {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"}, |
| 103 | {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"}, |
| 104 | // Chinese input/output |
| 105 | {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312", |
| 106 | "?q=%C4%E3%BA%C3"}, |
| 107 | {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"}, |
| 108 | // Unencodable character in the destination character set should be |
| 109 | // escaped. The escape sequence unescapes to be the entity name: |
| 110 | // "?q=你" |
| 111 | {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1", |
| 112 | "?q=Chinese%26%2365319%3B"}, |
| 113 | }; |
| 114 | |
Avi Drissman | a92b3be | 2018-12-24 21:55:29 | [diff] [blame] | 115 | for (size_t i = 0; i < base::size(query_cases); i++) { |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 116 | Component out_comp; |
| 117 | |
| 118 | UConvScoper conv(query_cases[i].encoding); |
| 119 | ASSERT_TRUE(!query_cases[i].encoding || conv.converter()); |
| 120 | ICUCharsetConverter converter(conv.converter()); |
| 121 | |
| 122 | if (query_cases[i].input8) { |
| 123 | int len = static_cast<int>(strlen(query_cases[i].input8)); |
| 124 | Component in_comp(0, len); |
| 125 | std::string out_str; |
| 126 | |
| 127 | StdStringCanonOutput output(&out_str); |
| 128 | CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output, |
| 129 | &out_comp); |
| 130 | output.Complete(); |
| 131 | |
| 132 | EXPECT_EQ(query_cases[i].expected, out_str); |
| 133 | } |
| 134 | |
| 135 | if (query_cases[i].input16) { |
brettw | 1b8582f | 2016-11-03 20:37:17 | [diff] [blame] | 136 | base::string16 input16( |
| 137 | test_utils::TruncateWStringToUTF16(query_cases[i].input16)); |
[email protected] | 847aaab8 | 2014-05-07 14:05:46 | [diff] [blame] | 138 | int len = static_cast<int>(input16.length()); |
| 139 | Component in_comp(0, len); |
| 140 | std::string out_str; |
| 141 | |
| 142 | StdStringCanonOutput output(&out_str); |
| 143 | CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output, |
| 144 | &out_comp); |
| 145 | output.Complete(); |
| 146 | |
| 147 | EXPECT_EQ(query_cases[i].expected, out_str); |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | // Extra test for input with embedded NULL; |
| 152 | std::string out_str; |
| 153 | StdStringCanonOutput output(&out_str); |
| 154 | Component out_comp; |
| 155 | CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp); |
| 156 | output.Complete(); |
| 157 | EXPECT_EQ("?a%20%00z%01", out_str); |
| 158 | } |
| 159 | |
| 160 | } // namespace |
| 161 | |
| 162 | } // namespace url |