URLPattern: Canonicalize pattern encoding.
This CL adds an encoding callback to liburlpattern::Parse(). The
parse will invoke the given callback for plaintext parts of the pattern
to validate and encode the characters. This callback mechanism is then
used to apply the chromium url canonicalization code for each component
pattern.
There are a couple of behaviors in the canonicalizer that do not play
well with this approach that the CL works around:
1. The port canonicalizer will replace an exact default port with the
empty string. Since the liburlpattern::Parse() callback is invoked
for partial values this CL instead implements this canoncilization
separately before pattern compilation.
2. The URL canonicalizer will prepend a leading `/` character if there
isn't one. Again, this behavior does not make sense when operating
on partial values. Therefore this CL exposes the internal partial
path canonicalization routine so that we can use it in URLPattern.
In addition, this CL removes a DCHECK from url's DoPartialPath() that
asserted there was always a character preceding a dot. The DCHECK has
had a runtime check checking the same behavior since 2013 so it seems
safe to remove the DCHECK. And in this case we want to be able to
run the canonicalize partial paths that do start with dots.
The CL adds a number of additional WPT test cases validating the new
canonicalization behavior.
The behavior in this test has been discussed in this spec issue:
https://github.com/WICG/urlpattern/issues/33
Bug: 1141510
Change-Id: I388be5d0cc57b125d44465b283050df5ed0b5321
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2720702
Reviewed-by: Jeremy Roman <[email protected]>
Reviewed-by: Charlie Harrison <[email protected]>
Commit-Queue: Ben Kelly <[email protected]>
Cr-Commit-Position: refs/heads/master@{#860399}
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc
index c457ea3..ecd6131 100644
--- a/url/url_canon_path.cc
+++ b/url/url_canon_path.cc
@@ -20,7 +20,8 @@
// table below more clear when neither ESCAPE or UNESCAPE is set.
PASS = 0,
- // This character requires special handling in DoPartialPath. Doing this test
+ // This character requires special handling in DoPartialPathInternal. Doing
+ // this test
// first allows us to filter out the common cases of regular characters that
// can be directly copied.
SPECIAL = 1,
@@ -235,10 +236,8 @@
}
}
-// Appends the given path to the output. It assumes that if the input path
-// starts with a slash, it should be copied to the output. If no path has
-// already been appended to the output (the case when not resolving
-// relative URLs), the path should begin with a slash.
+// Canonicalizes and appends the given path to the output. It assumes that if
+// the input path starts with a slash, it should be copied to the output.
//
// If there are already path components (this mode is used when appending
// relative paths for resolving), it assumes that the output already has
@@ -248,11 +247,11 @@
// We do not collapse multiple slashes in a row to a single slash. It seems
// no web browsers do this, and we don't want incompatibilities, even though
// it would be correct for most systems.
-template<typename CHAR, typename UCHAR>
-bool DoPartialPath(const CHAR* spec,
- const Component& path,
- int path_begin_in_output,
- CanonOutput* output) {
+template <typename CHAR, typename UCHAR>
+bool DoPartialPathInternal(const CHAR* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {
int end = path.end();
// We use this variable to minimize the amount of work done when unescaping --
@@ -279,16 +278,12 @@
// Needs special handling of some sort.
int dotlen;
if ((dotlen = IsDot(spec, i, end)) > 0) {
- // See if this dot was preceded by a slash in the output. We
- // assume that when canonicalizing paths, they will always
- // start with a slash and not a dot, so we don't have to
- // bounds check the output.
+ // See if this dot was preceded by a slash in the output.
//
// Note that we check this in the case of dots so we don't have to
// special case slashes. Since slashes are much more common than
// dots, this actually increases performance measurably (though
// slightly).
- DCHECK(output->length() > path_begin_in_output);
if (output->length() > path_begin_in_output &&
output->at(output->length() - 1) == '/') {
// Slash followed by a dot, check to see if this is means relative
@@ -382,6 +377,21 @@
return success;
}
+// Perform the same logic as in DoPartialPathInternal(), but updates the
+// publicly exposed CanonOutput structure similar to DoPath(). Returns
+// true if successful.
+template <typename CHAR, typename UCHAR>
+bool DoPartialPath(const CHAR* spec,
+ const Component& path,
+ CanonOutput* output,
+ Component* out_path) {
+ out_path->begin = output->length();
+ bool success =
+ DoPartialPathInternal<CHAR, UCHAR>(spec, path, out_path->begin, output);
+ out_path->len = output->length() - out_path->begin;
+ return success;
+}
+
template<typename CHAR, typename UCHAR>
bool DoPath(const CHAR* spec,
const Component& path,
@@ -397,7 +407,8 @@
if (!IsURLSlash(spec[path.begin]))
output->push_back('/');
- success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
+ success =
+ DoPartialPathInternal<CHAR, UCHAR>(spec, path, out_path->begin, output);
} else {
// No input, canonical path is a slash.
output->push_back('/');
@@ -424,19 +435,33 @@
bool CanonicalizePartialPath(const char* spec,
const Component& path,
- int path_begin_in_output,
- CanonOutput* output) {
- return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output,
- output);
+ CanonOutput* output,
+ Component* out_path) {
+ return DoPartialPath<char, unsigned char>(spec, path, output, out_path);
}
bool CanonicalizePartialPath(const base::char16* spec,
const Component& path,
- int path_begin_in_output,
- CanonOutput* output) {
- return DoPartialPath<base::char16, base::char16>(spec, path,
- path_begin_in_output,
- output);
+ CanonOutput* output,
+ Component* out_path) {
+ return DoPartialPath<base::char16, base::char16>(spec, path, output,
+ out_path);
+}
+
+bool CanonicalizePartialPathInternal(const char* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {
+ return DoPartialPathInternal<char, unsigned char>(
+ spec, path, path_begin_in_output, output);
+}
+
+bool CanonicalizePartialPathInternal(const base::char16* spec,
+ const Component& path,
+ int path_begin_in_output,
+ CanonOutput* output) {
+ return DoPartialPathInternal<base::char16, base::char16>(
+ spec, path, path_begin_in_output, output);
}
} // namespace url