blob: 62ec1a9661e5dfc349e9d0678e13734906c24335 [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
avic0c60312015-12-21 21:03:505#include <stddef.h>
6
Avi Drissmana92b3be2018-12-24 21:55:297#include "base/stl_util.h"
[email protected]847aaab82014-05-07 14:05:468#include "testing/gtest/include/gtest/gtest.h"
9#include "third_party/icu/source/common/unicode/ucnv.h"
10#include "url/url_canon.h"
11#include "url/url_canon_icu.h"
12#include "url/url_canon_stdstring.h"
13#include "url/url_test_utils.h"
14
[email protected]847aaab82014-05-07 14:05:4615namespace url {
16
[email protected]847aaab82014-05-07 14:05:4617namespace {
18
19// Wrapper around a UConverter object that managers creation and destruction.
20class UConvScoper {
21 public:
22 explicit UConvScoper(const char* charset_name) {
23 UErrorCode err = U_ZERO_ERROR;
24 converter_ = ucnv_open(charset_name, &err);
25 }
26
27 ~UConvScoper() {
28 if (converter_)
29 ucnv_close(converter_);
30 }
31
32 // Returns the converter object, may be NULL.
33 UConverter* converter() const { return converter_; }
34
35 private:
36 UConverter* converter_;
37};
38
39TEST(URLCanonIcuTest, ICUCharsetConverter) {
40 struct ICUCase {
41 const wchar_t* input;
42 const char* encoding;
43 const char* expected;
44 } icu_cases[] = {
45 // UTF-8.
46 {L"Hello, world", "utf-8", "Hello, world"},
47 {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
48 // Non-BMP UTF-8.
49 {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
50 // Big5
51 {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
52 // Unrepresentable character in the destination set.
53 {L"hello\x4f60\x06de\x597dworld", "big5",
54 "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
55 };
56
Avi Drissmana92b3be2018-12-24 21:55:2957 for (size_t i = 0; i < base::size(icu_cases); i++) {
[email protected]847aaab82014-05-07 14:05:4658 UConvScoper conv(icu_cases[i].encoding);
59 ASSERT_TRUE(conv.converter() != NULL);
60 ICUCharsetConverter converter(conv.converter());
61
62 std::string str;
63 StdStringCanonOutput output(&str);
64
brettw1b8582f2016-11-03 20:37:1765 base::string16 input_str(
66 test_utils::TruncateWStringToUTF16(icu_cases[i].input));
[email protected]847aaab82014-05-07 14:05:4667 int input_len = static_cast<int>(input_str.length());
68 converter.ConvertFromUTF16(input_str.c_str(), input_len, &output);
69 output.Complete();
70
71 EXPECT_STREQ(icu_cases[i].expected, str.c_str());
72 }
73
74 // Test string sizes around the resize boundary for the output to make sure
75 // the converter resizes as needed.
76 const int static_size = 16;
77 UConvScoper conv("utf-8");
78 ASSERT_TRUE(conv.converter());
79 ICUCharsetConverter converter(conv.converter());
80 for (int i = static_size - 2; i <= static_size + 2; i++) {
81 // Make a string with the appropriate length.
82 base::string16 input;
83 for (int ch = 0; ch < i; ch++)
84 input.push_back('a');
85
86 RawCanonOutput<static_size> output;
87 converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
88 &output);
89 EXPECT_EQ(input.length(), static_cast<size_t>(output.length()));
90 }
91}
92
93TEST(URLCanonIcuTest, QueryWithConverter) {
94 struct QueryCase {
95 const char* input8;
96 const wchar_t* input16;
97 const char* encoding;
98 const char* expected;
99 } query_cases[] = {
100 // Regular ASCII case in some different encodings.
101 {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
102 {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
103 {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
104 // Chinese input/output
105 {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
106 "?q=%C4%E3%BA%C3"},
107 {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
108 // Unencodable character in the destination character set should be
109 // escaped. The escape sequence unescapes to be the entity name:
110 // "?q=&#20320;"
111 {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
112 "?q=Chinese%26%2365319%3B"},
113 };
114
Avi Drissmana92b3be2018-12-24 21:55:29115 for (size_t i = 0; i < base::size(query_cases); i++) {
[email protected]847aaab82014-05-07 14:05:46116 Component out_comp;
117
118 UConvScoper conv(query_cases[i].encoding);
119 ASSERT_TRUE(!query_cases[i].encoding || conv.converter());
120 ICUCharsetConverter converter(conv.converter());
121
122 if (query_cases[i].input8) {
123 int len = static_cast<int>(strlen(query_cases[i].input8));
124 Component in_comp(0, len);
125 std::string out_str;
126
127 StdStringCanonOutput output(&out_str);
128 CanonicalizeQuery(query_cases[i].input8, in_comp, &converter, &output,
129 &out_comp);
130 output.Complete();
131
132 EXPECT_EQ(query_cases[i].expected, out_str);
133 }
134
135 if (query_cases[i].input16) {
brettw1b8582f2016-11-03 20:37:17136 base::string16 input16(
137 test_utils::TruncateWStringToUTF16(query_cases[i].input16));
[email protected]847aaab82014-05-07 14:05:46138 int len = static_cast<int>(input16.length());
139 Component in_comp(0, len);
140 std::string out_str;
141
142 StdStringCanonOutput output(&out_str);
143 CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
144 &out_comp);
145 output.Complete();
146
147 EXPECT_EQ(query_cases[i].expected, out_str);
148 }
149 }
150
151 // Extra test for input with embedded NULL;
152 std::string out_str;
153 StdStringCanonOutput output(&out_str);
154 Component out_comp;
155 CanonicalizeQuery("a \x00z\x01", Component(0, 5), NULL, &output, &out_comp);
156 output.Complete();
157 EXPECT_EQ("?a%20%00z%01", out_str);
158}
159
160} // namespace
161
162} // namespace url