blob: 9a6b2da31c3fa983ccd86ed3fbbae2dc4ae32ad8 [file] [log] [blame]
pkalinnikov15cf7242016-07-13 08:57:341// Copyright 2016 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
pkalinnikov35d18812017-04-05 17:28:185// The matching logic distinguishes between the terms URL pattern and
6// subpattern. A URL pattern usually stands for the full thing, e.g.
7// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8// of a pattern not containing the wildcard '*' character. For the example above
9// the subpatterns are: "example.com^", "path" and "par=val^".
10//
11// The separator placeholder '^' symbol is used in subpatterns to match any
12// separator character, which is any ASCII symbol except letters, digits, and
13// the following: '_', '-', '.', '%'. Note that the separator placeholder
14// character '^' is itself a separator, as well as '\0'.
15
Pavel Kalinnikovd7970632017-06-20 09:07:3416#include "components/url_pattern_index/url_pattern.h"
pkalinnikov15cf7242016-07-13 08:57:3417
pkalinnikov35d18812017-04-05 17:28:1818#include <stddef.h>
19
Karan Bhatiaa9c4e1d2018-09-10 23:37:4720#include <algorithm>
pkalinnikov35d18812017-04-05 17:28:1821#include <ostream>
22
Hans Wennborgdf87046c2020-04-28 11:06:2423#include "base/check_op.h"
24#include "base/notreached.h"
Charles Harrison78ff41bf2018-02-08 20:21:1125#include "base/numerics/checked_math.h"
Karan Bhatiaa9c4e1d2018-09-10 23:37:4726#include "base/strings/string_util.h"
Pavel Kalinnikovd7970632017-06-20 09:07:3427#include "components/url_pattern_index/flat/url_pattern_index_generated.h"
28#include "components/url_pattern_index/fuzzy_pattern_matching.h"
29#include "components/url_pattern_index/string_splitter.h"
pkalinnikov35d18812017-04-05 17:28:1830#include "url/gurl.h"
31#include "url/third_party/mozilla/url_parse.h"
pkalinnikov15cf7242016-07-13 08:57:3432
Pavel Kalinnikovd7970632017-06-20 09:07:3433namespace url_pattern_index {
pkalinnikov15cf7242016-07-13 08:57:3434
35namespace {
36
Karandeep Bhatiacf2b1a02019-02-25 23:09:3137constexpr char kWildcard = '*';
38
pkalinnikov35d18812017-04-05 17:28:1839class IsWildcard {
40 public:
Karandeep Bhatiacf2b1a02019-02-25 23:09:3141 bool operator()(char c) const { return c == kWildcard; }
pkalinnikov35d18812017-04-05 17:28:1842};
43
pkalinnikov15cf7242016-07-13 08:57:3444proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
45 switch (type) {
46 case flat::UrlPatternType_SUBSTRING:
47 return proto::URL_PATTERN_TYPE_SUBSTRING;
48 case flat::UrlPatternType_WILDCARDED:
49 return proto::URL_PATTERN_TYPE_WILDCARDED;
50 case flat::UrlPatternType_REGEXP:
51 return proto::URL_PATTERN_TYPE_REGEXP;
52 default:
53 return proto::URL_PATTERN_TYPE_UNSPECIFIED;
54 }
55}
56
57proto::AnchorType ConvertAnchorType(flat::AnchorType type) {
58 switch (type) {
59 case flat::AnchorType_NONE:
60 return proto::ANCHOR_TYPE_NONE;
61 case flat::AnchorType_BOUNDARY:
62 return proto::ANCHOR_TYPE_BOUNDARY;
63 case flat::AnchorType_SUBDOMAIN:
64 return proto::ANCHOR_TYPE_SUBDOMAIN;
65 default:
66 return proto::ANCHOR_TYPE_UNSPECIFIED;
67 }
68}
69
70base::StringPiece ConvertString(const flatbuffers::String* string) {
71 return string ? base::StringPiece(string->data(), string->size())
72 : base::StringPiece();
73}
74
Karan Bhatiaa9c4e1d2018-09-10 23:37:4775bool HasAnyUpperAscii(base::StringPiece string) {
76 return std::any_of(string.begin(), string.end(), base::IsAsciiUpper<char>);
77}
78
pkalinnikov35d18812017-04-05 17:28:1879// Returns whether |position| within the |url| belongs to its |host| component
80// and corresponds to the beginning of a (sub-)domain.
81inline bool IsSubdomainAnchored(base::StringPiece url,
82 url::Component host,
83 size_t position) {
84 DCHECK_LE(position, url.size());
85 const size_t host_begin = static_cast<size_t>(host.begin);
86 const size_t host_end = static_cast<size_t>(host.end());
87 DCHECK_LE(host_end, url.size());
88
89 return position == host_begin ||
90 (position > host_begin && position <= host_end &&
91 url[position - 1] == '.');
92}
93
94// Returns the position of the leftmost occurrence of a |subpattern| in the
95// |text| starting no earlier than |from| the specified position. If the
96// |subpattern| has separator placeholders, searches for a fuzzy occurrence.
97size_t FindSubpattern(base::StringPiece text,
98 base::StringPiece subpattern,
99 size_t from = 0) {
100 const bool is_fuzzy =
101 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
102 return is_fuzzy ? FindFuzzy(text, subpattern, from)
103 : text.find(subpattern, from);
104}
105
106// Same as FindSubpattern(url, subpattern), but searches for an occurrence that
107// starts at the beginning of a (sub-)domain within the url's |host| component.
108size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
109 url::Component host,
110 base::StringPiece subpattern) {
111 const bool is_fuzzy =
112 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
113
Charles Harrison78ff41bf2018-02-08 20:21:11114 // Any match found after the end of the host will be discarded, so just
115 // avoid searching there for the subpattern to begin with.
116 //
117 // Check for overflow.
118 size_t max_match_end = 0;
119 if (!base::CheckAdd(host.end(), subpattern.length())
120 .AssignIfValid(&max_match_end)) {
121 return base::StringPiece::npos;
122 }
123 const base::StringPiece url_match_candidate = url.substr(0, max_match_end);
124 const base::StringPiece url_host = url.substr(0, host.end());
125
126 for (size_t position = static_cast<size_t>(host.begin);
127 position <= static_cast<size_t>(host.end()); ++position) {
128 // Enforce as a loop precondition that we are always anchored at a
129 // sub-domain before calling find. This is to reduce the number of potential
130 // searches for |subpattern|.
131 DCHECK(IsSubdomainAnchored(url, host, position));
132
133 position = is_fuzzy ? FindFuzzy(url_match_candidate, subpattern, position)
134 : url_match_candidate.find(subpattern, position);
pkalinnikov35d18812017-04-05 17:28:18135 if (position == base::StringPiece::npos ||
136 IsSubdomainAnchored(url, host, position)) {
137 return position;
138 }
Charles Harrison78ff41bf2018-02-08 20:21:11139
140 // Enforce the loop precondition. This skips |position| to the next '.',
141 // within the host, which the loop itself increments to the anchored
142 // sub-domain.
143 position = url_host.find('.', position);
144 if (position == base::StringPiece::npos)
145 break;
pkalinnikov35d18812017-04-05 17:28:18146 }
147 return base::StringPiece::npos;
148}
149
Karan Bhatia1d1eaed22019-02-20 21:07:17150// Helper for DoesTextMatchLastSubpattern. Treats kSeparatorPlaceholder as not
151// matching the end of the text.
152bool DoesTextMatchLastSubpatternInternal(proto::AnchorType anchor_left,
153 proto::AnchorType anchor_right,
154 base::StringPiece text,
155 url::Component url_host,
156 base::StringPiece subpattern) {
157 // Enumerate all possible combinations of |anchor_left| and |anchor_right|.
158 if (anchor_left == proto::ANCHOR_TYPE_NONE &&
159 anchor_right == proto::ANCHOR_TYPE_NONE) {
160 return FindSubpattern(text, subpattern) != base::StringPiece::npos;
161 }
162
163 if (anchor_left == proto::ANCHOR_TYPE_NONE &&
164 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
165 return EndsWithFuzzy(text, subpattern);
166 }
167
168 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
169 anchor_right == proto::ANCHOR_TYPE_NONE) {
170 return StartsWithFuzzy(text, subpattern);
171 }
172
173 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY &&
174 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
175 return text.size() == subpattern.size() &&
176 StartsWithFuzzy(text, subpattern);
177 }
178
179 if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
180 anchor_right == proto::ANCHOR_TYPE_NONE) {
181 return url_host.is_nonempty() &&
182 FindSubdomainAnchoredSubpattern(text, url_host, subpattern) !=
183 base::StringPiece::npos;
184 }
185
186 if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN &&
187 anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
188 return url_host.is_nonempty() && text.size() >= subpattern.size() &&
189 IsSubdomainAnchored(text, url_host,
190 text.size() - subpattern.size()) &&
191 EndsWithFuzzy(text, subpattern);
192 }
193
194 NOTREACHED();
195 return false;
196}
197
198// Matches the last |subpattern| against |text|. Special treatment is required
199// for the last subpattern since |kSeparatorPlaceholder| can also match the end
200// of the text.
201bool DoesTextMatchLastSubpattern(proto::AnchorType anchor_left,
202 proto::AnchorType anchor_right,
203 base::StringPiece text,
204 url::Component url_host,
205 base::StringPiece subpattern) {
206 DCHECK(!subpattern.empty());
207
208 if (DoesTextMatchLastSubpatternInternal(anchor_left, anchor_right, text,
209 url_host, subpattern)) {
210 return true;
211 }
212
213 // If the last |subpattern| ends with kSeparatorPlaceholder, then it can also
214 // match the end of text.
215 if (subpattern.back() == kSeparatorPlaceholder) {
216 subpattern.remove_suffix(1);
217 return DoesTextMatchLastSubpatternInternal(
218 anchor_left, proto::ANCHOR_TYPE_BOUNDARY, text, url_host, subpattern);
219 }
220
221 return false;
222}
223
Karan Bhatiaa9c4e1d2018-09-10 23:37:47224// Returns whether the given |url_pattern| matches the given |url_spec|.
225// Compares the pattern the the url in a case-sensitive manner.
226bool IsCaseSensitiveMatch(base::StringPiece url_pattern,
227 proto::AnchorType anchor_left,
228 proto::AnchorType anchor_right,
229 base::StringPiece url_spec,
230 url::Component url_host) {
231 DCHECK(!url_spec.empty());
232
233 StringSplitter<IsWildcard> subpatterns(url_pattern);
234 auto subpattern_it = subpatterns.begin();
235 auto subpattern_end = subpatterns.end();
236
Karan Bhatia1d1eaed22019-02-20 21:07:17237 // No subpatterns.
Karan Bhatiaa9c4e1d2018-09-10 23:37:47238 if (subpattern_it == subpattern_end) {
239 return anchor_left == proto::ANCHOR_TYPE_NONE ||
240 anchor_right == proto::ANCHOR_TYPE_NONE;
241 }
242
243 base::StringPiece subpattern = *subpattern_it;
244 ++subpattern_it;
245
Karan Bhatia1d1eaed22019-02-20 21:07:17246 // There is only one |subpattern|.
247 if (subpattern_it == subpattern_end) {
248 return DoesTextMatchLastSubpattern(anchor_left, anchor_right, url_spec,
249 url_host, subpattern);
Karan Bhatiaa9c4e1d2018-09-10 23:37:47250 }
251
252 // Otherwise, the first |subpattern| does not have to be a suffix. But it
253 // still can have a left anchor. Check and handle that.
254 base::StringPiece text = url_spec;
255 if (anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
256 if (!StartsWithFuzzy(url_spec, subpattern))
257 return false;
Karan Bhatiaa9c4e1d2018-09-10 23:37:47258 text.remove_prefix(subpattern.size());
259 } else if (anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
260 if (!url_host.is_nonempty())
261 return false;
262 const size_t match_begin =
263 FindSubdomainAnchoredSubpattern(url_spec, url_host, subpattern);
264 if (match_begin == base::StringPiece::npos)
265 return false;
Karan Bhatiaa9c4e1d2018-09-10 23:37:47266 text.remove_prefix(match_begin + subpattern.size());
267 } else {
268 DCHECK_EQ(anchor_left, proto::ANCHOR_TYPE_NONE);
269 // Get back to the initial |subpattern|, process it in the loop below.
270 subpattern_it = subpatterns.begin();
271 }
272
Karan Bhatia1d1eaed22019-02-20 21:07:17273 DCHECK(subpattern_it != subpattern_end);
274 subpattern = *subpattern_it;
Karan Bhatiaa9c4e1d2018-09-10 23:37:47275
Karan Bhatia1d1eaed22019-02-20 21:07:17276 // Consecutively find all the remaining subpatterns in the |text|. Handle the
277 // last subpattern outside the loop.
278 while (++subpattern_it != subpattern_end) {
279 DCHECK(!subpattern.empty());
Karan Bhatiaa9c4e1d2018-09-10 23:37:47280
281 const size_t match_position = FindSubpattern(text, subpattern);
282 if (match_position == base::StringPiece::npos)
283 return false;
284 text.remove_prefix(match_position + subpattern.size());
Karan Bhatia1d1eaed22019-02-20 21:07:17285
286 subpattern = *subpattern_it;
Karan Bhatiaa9c4e1d2018-09-10 23:37:47287 }
288
Karan Bhatia1d1eaed22019-02-20 21:07:17289 return DoesTextMatchLastSubpattern(proto::ANCHOR_TYPE_NONE, anchor_right,
290 text, url::Component(), subpattern);
Karan Bhatiaa9c4e1d2018-09-10 23:37:47291}
292
pkalinnikov15cf7242016-07-13 08:57:34293} // namespace
294
Karan Bhatiae0aeb0e2018-09-12 18:57:21295UrlPattern::UrlInfo::UrlInfo(const GURL& url)
296 : spec_(url.possibly_invalid_spec()),
Karan Bhatiae0aeb0e2018-09-12 18:57:21297 host_(url.parsed_for_possibly_invalid_spec().host) {
298 DCHECK(url.is_valid());
299}
300
Karan Bhatiae177fb62018-09-14 00:57:30301base::StringPiece UrlPattern::UrlInfo::GetLowerCaseSpec() const {
302 if (lower_case_spec_cached_)
303 return *lower_case_spec_cached_;
304
305 if (!HasAnyUpperAscii(spec_)) {
306 lower_case_spec_cached_ = spec_;
307 } else {
308 lower_case_spec_owner_ = base::ToLowerASCII(spec_);
309 lower_case_spec_cached_ = lower_case_spec_owner_;
310 }
311 return *lower_case_spec_cached_;
312}
313
Karan Bhatiae0aeb0e2018-09-12 18:57:21314UrlPattern::UrlInfo::~UrlInfo() = default;
315
pkalinnikov15cf7242016-07-13 08:57:34316UrlPattern::UrlPattern() = default;
317
318UrlPattern::UrlPattern(base::StringPiece url_pattern,
Karan Bhatiaa9c4e1d2018-09-10 23:37:47319 proto::UrlPatternType type,
320 MatchCase match_case)
321 : type_(type), url_pattern_(url_pattern), match_case_(match_case) {}
pkalinnikov15cf7242016-07-13 08:57:34322
323UrlPattern::UrlPattern(base::StringPiece url_pattern,
324 proto::AnchorType anchor_left,
325 proto::AnchorType anchor_right)
pkalinnikov35d18812017-04-05 17:28:18326 : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
327 url_pattern_(url_pattern),
328 anchor_left_(anchor_left),
329 anchor_right_(anchor_right) {}
pkalinnikov15cf7242016-07-13 08:57:34330
331UrlPattern::UrlPattern(const flat::UrlRule& rule)
pkalinnikov35d18812017-04-05 17:28:18332 : type_(ConvertUrlPatternType(rule.url_pattern_type())),
333 url_pattern_(ConvertString(rule.url_pattern())),
334 anchor_left_(ConvertAnchorType(rule.anchor_left())),
335 anchor_right_(ConvertAnchorType(rule.anchor_right())),
Charlie Harrison8d71f6f2018-09-14 14:43:26336 match_case_(rule.options() & flat::OptionFlag_IS_CASE_INSENSITIVE
337 ? MatchCase::kFalse
338 : MatchCase::kTrue) {}
pkalinnikov15cf7242016-07-13 08:57:34339
pkalinnikov15cf7242016-07-13 08:57:34340UrlPattern::~UrlPattern() = default;
341
Karan Bhatiae0aeb0e2018-09-12 18:57:21342bool UrlPattern::MatchesUrl(const UrlInfo& url) const {
pkalinnikov35d18812017-04-05 17:28:18343 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
344 type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
Karan Bhatiaa9c4e1d2018-09-10 23:37:47345 DCHECK(base::IsStringASCII(url_pattern_));
Karan Bhatiae0aeb0e2018-09-12 18:57:21346 DCHECK(base::IsStringASCII(url.spec()));
Karan Bhatiae177fb62018-09-14 00:57:30347 DCHECK(base::IsStringASCII(url.GetLowerCaseSpec()));
pkalinnikov35d18812017-04-05 17:28:18348
Karandeep Bhatiacf2b1a02019-02-25 23:09:31349 // Pre-process patterns to ensure left anchored and right anchored patterns
350 // don't begin and end with a wildcard respectively i.e. change "|*xyz" to
351 // "*xyz" and "xyz*|" to "xyz*".
352 proto::AnchorType anchor_left = anchor_left_;
353 proto::AnchorType anchor_right = anchor_right_;
354 if (!url_pattern_.empty()) {
355 if (url_pattern_.front() == kWildcard) {
356 // Note: We don't handle "||*" and expect clients to disallow it.
357 DCHECK_NE(proto::ANCHOR_TYPE_SUBDOMAIN, anchor_left_);
358 anchor_left = proto::ANCHOR_TYPE_NONE;
359 }
360 if (url_pattern_.back() == kWildcard)
361 anchor_right = proto::ANCHOR_TYPE_NONE;
362 }
363
Karan Bhatiae0aeb0e2018-09-12 18:57:21364 if (match_case()) {
Karandeep Bhatiacf2b1a02019-02-25 23:09:31365 return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
Karan Bhatiae0aeb0e2018-09-12 18:57:21366 url.spec(), url.host());
pkalinnikov35d18812017-04-05 17:28:18367 }
368
Karan Bhatiaa06f6822018-09-18 00:05:49369 // Use the lower-cased url for case-insensitive comparison. Case-insensitive
370 // patterns should already be lower-cased.
371 DCHECK(!HasAnyUpperAscii(url_pattern_));
Karandeep Bhatiacf2b1a02019-02-25 23:09:31372 return IsCaseSensitiveMatch(url_pattern_, anchor_left, anchor_right,
Karan Bhatiaa06f6822018-09-18 00:05:49373 url.GetLowerCaseSpec(), url.host());
pkalinnikov35d18812017-04-05 17:28:18374}
375
376std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
pkalinnikov35d18812017-04-05 17:28:18377 switch (pattern.anchor_left()) {
378 case proto::ANCHOR_TYPE_SUBDOMAIN:
379 out << '|';
Nico Weberb1cea5c2018-01-29 22:26:07380 FALLTHROUGH;
pkalinnikov35d18812017-04-05 17:28:18381 case proto::ANCHOR_TYPE_BOUNDARY:
382 out << '|';
Nico Weberb1cea5c2018-01-29 22:26:07383 FALLTHROUGH;
pkalinnikov35d18812017-04-05 17:28:18384 default:
385 break;
386 }
387 out << pattern.url_pattern();
388 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
389 out << '|';
390 if (pattern.match_case())
391 out << "$match-case";
392
393 return out;
394}
395
Pavel Kalinnikovd7970632017-06-20 09:07:34396} // namespace url_pattern_index