URLPatternIndex: Allow separator placeholder to match the end of url.
This CL modifies the URLPatternIndex matching algorithm to ensure that the
separator placeholder (^) also matches the end of the text, thus fixing an
existing TODO.
BUG=772260
Change-Id: If6917c1ea4c7a037765ca421075bf298f64c5ceb
Reviewed-on: https://chromium-review.googlesource.com/c/1476814
Commit-Queue: Karan Bhatia <[email protected]>
Reviewed-by: Istiaque Ahmed <[email protected]>
Reviewed-by: Charlie Harrison <[email protected]>
Cr-Commit-Position: refs/heads/master@{#633869}
diff --git a/components/url_pattern_index/fuzzy_pattern_matching.h b/components/url_pattern_index/fuzzy_pattern_matching.h
index 52dc75b..3feb7701 100644
--- a/components/url_pattern_index/fuzzy_pattern_matching.h
+++ b/components/url_pattern_index/fuzzy_pattern_matching.h
@@ -6,8 +6,10 @@
// separator character, which is any ASCII symbol except letters, digits, and
// the following: '_', '-', '.', '%'. Note that the separator placeholder
// character '^' is itself a separator, as well as '\0'.
-// TODO(pkalinnikov): In addition, a separator placeholder at the end of the
-// pattern can be matched by the end of |text|.
+//
+// In addition, a separator placeholder at the end of the pattern can be matched
+// by the end of |text|. This should be handled by the clients using the
+// following utility functions.
//
// We define a fuzzy occurrence as an occurrence of a |subpattern| in |text|
// such that all its non-placeholder characters are equal to the corresponding
diff --git a/components/url_pattern_index/url_pattern.cc b/components/url_pattern_index/url_pattern.cc
index a9762fa..81b80bfb 100644
--- a/components/url_pattern_index/url_pattern.cc
+++ b/components/url_pattern_index/url_pattern.cc
@@ -144,6 +144,80 @@
return base::StringPiece::npos;
}
+// Helper for DoesTextMatchLastSubpattern. Treats kSeparatorPlaceholder as not
+// matching the end of the text.
+bool DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,
+ proto::AnchorType anchor_right,
+ base::StringPiece text,
+ url::Component url_host,
+ base::StringPiece subpattern) {
+ // Enumerate all possible combinations of |anchor_left| and |anchor_right|.
+ if (anchor_left == proto::ANCHOR_TYPE_NONE &&
+ anchor_right == proto::ANCHOR_TYPE_NONE) {
+ return FindSubpattern(text, subpattern) != base::StringPiece::npos;
+ }
+
+ if (anchor_left == proto::ANCHOR_TYPE_NONE &&
+ anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
+ return EndsWithFuzzy(text, subpattern);
+ }
+
+ if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
+ anchor_right == proto::ANCHOR_TYPE_NONE) {
+ return StartsWithFuzzy(text, subpattern);
+ }
+
+ if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
+ anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
+ return text.size() == subpattern.size() &&
+ StartsWithFuzzy(text, subpattern);
+ }
+
+ if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
+ anchor_right == proto::ANCHOR_TYPE_NONE) {
+ return url_host.is_nonempty() &&
+ FindSubdomainAnchoredSubpattern(text, url_host, subpattern) !=
+ base::StringPiece::npos;
+ }
+
+ if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
+ anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
+ return url_host.is_nonempty() && text.size() >= subpattern.size() &&
+ IsSubdomainAnchored(text, url_host,
+ text.size() - subpattern.size()) &&
+ EndsWithFuzzy(text, subpattern);
+ }
+
+ NOTREACHED();
+ return false;
+}
+
+// Matches the last |subpattern| against |text|. Special treatment is required
+// for the last subpattern since |kSeparatorPlaceholder| can also match the end
+// of the text.
+bool DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,
+ proto::AnchorType anchor_right,
+ base::StringPiece text,
+ url::Component url_host,
+ base::StringPiece subpattern) {
+ DCHECK(!subpattern.empty());
+
+ if (DoesTextMatchLastSubpatternInternal(anchor_left, anchor_right, text,
+ url_host, subpattern)) {
+ return true;
+ }
+
+ // If the last |subpattern| ends with kSeparatorPlaceholder, then it can also
+ // match the end of text.
+ if (subpattern.back() == kSeparatorPlaceholder) {
+ subpattern.remove_suffix(1);
+ return DoesTextMatchLastSubpatternInternal(
+ anchor_left, proto::ANCHOR_TYPE_BOUNDARY, text, url_host, subpattern);
+ }
+
+ return false;
+}
+
// Returns whether the given |url_pattern| matches the given |url_spec|.
// Compares the pattern the the url in a case-sensitive manner.
bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
@@ -157,6 +231,7 @@
auto subpattern_it = subpatterns.begin();
auto subpattern_end = subpatterns.end();
+ // No subpatterns.
if (subpattern_it == subpattern_end) {
return anchor_left == proto::ANCHOR_TYPE_NONE ||
anchor_right == proto::ANCHOR_TYPE_NONE;
@@ -165,22 +240,10 @@
base::StringPiece subpattern = *subpattern_it;
++subpattern_it;
- // If there is only one |subpattern|, and it has a right anchor, then simply
- // check that it is a suffix of the |url_spec|, and the left anchor is
- // fulfilled.
- if (subpattern_it == subpattern_end &&
- anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
- if (!EndsWithFuzzy(url_spec, subpattern))
- return false;
- if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY)
- return url_spec.size() == subpattern.size();
- if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
- DCHECK_LE(subpattern.size(), url_spec.size());
- return url_host.is_nonempty() &&
- IsSubdomainAnchored(url_spec, url_host,
- url_spec.size() - subpattern.size());
- }
- return true;
+ // There is only one |subpattern|.
+ if (subpattern_it == subpattern_end) {
+ return DoesTextMatchLastSubpattern(anchor_left, anchor_right, url_spec,
+ url_host, subpattern);
}
// Otherwise, the first |subpattern| does not have to be a suffix. But it
@@ -189,10 +252,6 @@
if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
if (!StartsWithFuzzy(url_spec, subpattern))
return false;
- if (subpattern_it == subpattern_end) {
- DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
- return true;
- }
text.remove_prefix(subpattern.size());
} else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
if (!url_host.is_nonempty())
@@ -201,10 +260,6 @@
FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
if (match_begin == base::StringPiece::npos)
return false;
- if (subpattern_it == subpattern_end) {
- DCHECK_EQ(anchor_right, proto::ANCHOR_TYPE_NONE);
- return true;
- }
text.remove_prefix(match_begin + subpattern.size());
} else {
DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
@@ -212,26 +267,24 @@
subpattern_it = subpatterns.begin();
}
- // Consecutively find all the remaining subpatterns in the |text|. If the
- // pattern has a right anchor, don't search for the last subpattern, but
- // instead check that it is a suffix of the |text|.
- while (subpattern_it != subpattern_end) {
- subpattern = *subpattern_it;
- DCHECK(!subpattern.empty());
+ DCHECK(subpattern_it != subpattern_end);
+ subpattern = *subpattern_it;
- if (++subpattern_it == subpattern_end &&
- anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
- break;
- }
+ // Consecutively find all the remaining subpatterns in the |text|. Handle the
+ // last subpattern outside the loop.
+ while (++subpattern_it != subpattern_end) {
+ DCHECK(!subpattern.empty());
const size_t match_position = FindSubpattern(text, subpattern);
if (match_position == base::StringPiece::npos)
return false;
text.remove_prefix(match_position + subpattern.size());
+
+ subpattern = *subpattern_it;
}
- return anchor_right != proto::ANCHOR_TYPE_BOUNDARY ||
- EndsWithFuzzy(text, subpattern);
+ return DoesTextMatchLastSubpattern(proto::ANCHOR_TYPE_NONE, anchor_right,
+ text, url::Component(), subpattern);
}
} // namespace
diff --git a/components/url_pattern_index/url_pattern_unittest.cc b/components/url_pattern_index/url_pattern_unittest.cc
index d067d12..950c907 100644
--- a/components/url_pattern_index/url_pattern_unittest.cc
+++ b/components/url_pattern_index/url_pattern_unittest.cc
@@ -90,7 +90,9 @@
{{"^^a^^"}, "http://ex.com/?a=/", true},
{{"^^a^^"}, "http://ex.com/?a=/&b=0", true},
- {{"^^a^^"}, "http://ex.com/?a=", false},
+ {{"^^a^^"}, "http://ex.com/?a=x", false},
+ // The last ^ matches the end of the url.
+ {{"^^a^^"}, "http://ex.com/?a=", true},
{{"ex.com^path^*k=v^"}, "http://ex.com/path/?k1=v1&ak=v&kk=vv", true},
{{"ex.com^path^*k=v^"}, "http://ex.com/p/path/?k1=v1&ak=v&kk=vv", false},
@@ -152,6 +154,38 @@
{{"abc*def^", proto::URL_PATTERN_TYPE_WILDCARDED, kDonotMatchCase},
"http://a.com/abcxAdef/vo",
true},
+ {{"abc^", kAnchorNone, kAnchorNone}, "https://xyz.com/abc/123", true},
+ {{"abc^", kAnchorNone, kAnchorNone}, "https://xyz.com/abc", true},
+ {{"abc^", kAnchorNone, kAnchorNone}, "https://abc.com", false},
+ {{"abc^", kAnchorNone, kBoundary}, "https://xyz.com/abc/", true},
+ {{"abc^", kAnchorNone, kBoundary}, "https://xyz.com/abc", true},
+ {{"abc^", kAnchorNone, kBoundary}, "https://xyz.com/abc/123", false},
+ {{"http://abc.com/x^", kBoundary, kAnchorNone}, "http://abc.com/x", true},
+ {{"http://abc.com/x^", kBoundary, kAnchorNone},
+ "http://abc.com/x/",
+ true},
+ {{"http://abc.com/x^", kBoundary, kAnchorNone},
+ "http://abc.com/x/123",
+ true},
+ {{"http://abc.com/x^", kBoundary, kBoundary}, "http://abc.com/x", true},
+ {{"http://abc.com/x^", kBoundary, kBoundary}, "http://abc.com/x/", true},
+ {{"http://abc.com/x^", kBoundary, kBoundary},
+ "http://abc.com/x/123",
+ false},
+ {{"abc.com^", kSubdomain, kAnchorNone}, "http://xyz.abc.com/123", true},
+ {{"abc.com^", kSubdomain, kAnchorNone}, "http://xyz.abc.com", true},
+ {{"abc.com^", kSubdomain, kAnchorNone},
+ "http://abc.com.xyz.com?q=abc.com",
+ false},
+ {{"abc.com^", kSubdomain, kBoundary}, "http://xyz.abc.com/123", false},
+ {{"abc.com^", kSubdomain, kBoundary}, "http://xyz.abc.com", true},
+ {{"abc.com^", kSubdomain, kBoundary},
+ "http://abc.com.xyz.com?q=abc.com/",
+ false},
+ {{"abc*^", kAnchorNone, kAnchorNone}, "https://abc.com", true},
+ {{"abc*^", kAnchorNone, kAnchorNone}, "https://abc.com?q=123", true},
+ {{"abc*^", kAnchorNone, kBoundary}, "https://abc.com", true},
+ {{"abc*^", kAnchorNone, kBoundary}, "https://abc.com?q=123", true},
};
for (const auto& test_case : kTestCases) {