blob: 4c13198ec2e4241bb9e124f9a4a6ab0dcaa8c1c7 [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
Pavel Kalinnikovd7970632017-06-20 09:07:345#ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
6#define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
pkalinnikove77a62e2016-06-24 10:21:407
8#include <stddef.h>
9
10#include <iterator>
11#include <type_traits>
12
13#include "base/logging.h"
14#include "base/strings/string_piece.h"
Charlie Harrison03d146732018-09-13 20:37:0215#include "base/strings/string_util.h"
pkalinnikove77a62e2016-06-24 10:21:4016
Pavel Kalinnikovd7970632017-06-20 09:07:3417namespace url_pattern_index {
pkalinnikove77a62e2016-06-24 10:21:4018
// The class used to iteratively extract N-grams from strings. An N-gram is a
// string consisting of N (up to 8) non-special characters, which are stored in
// the lowest N non-zero bytes, lower bytes corresponding to later symbols. The
// size of the integer type limits the maximum value of N. For example, a
// uint64_t can store up to 8-grams.
//
// Note: If used for UTF-8 strings, the N-grams can have partial byte sequences.
//
// Template parameters:
//  * N - the size of N-grams.
//  * NGramType - the integer type used to encode N-grams.
//  * CasePolicy - whether or not to lower-case the N-grams. Assumes ASCII.
//  * IsSeparator - the type of a bool(char) functor.
// Selects how characters are transformed before being packed into an N-gram.
enum class NGramCaseExtraction {
  // Characters are stored exactly as they appear in the string.
  kCaseSensitive,
  // Characters are lower-cased (ASCII only) before being stored.
  kLowerCase,
};
33template <size_t N,
34 typename NGramType,
35 NGramCaseExtraction CasePolicy,
36 typename IsSeparator>
pkalinnikove77a62e2016-06-24 10:21:4037class NGramExtractor {
38 public:
39 // An STL compatible input iterator over N-grams contained in a string.
40 class Iterator : public std::iterator<std::input_iterator_tag, NGramType> {
41 public:
42 // Creates an iterator, which points to the leftmost valid N-gram within the
43 // |extractor|'s string, starting from |head|.
44 Iterator(const NGramExtractor& extractor,
45 base::StringPiece::const_iterator head)
46 : extractor_(extractor), head_(head), end_(extractor.string_.end()) {
47 DCHECK_GE(head, extractor_.string_.begin());
48 DCHECK_LE(head, end_);
49
50 CompleteNGramFrom(0);
51 }
52
53 bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; }
54 bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
55
56 NGramType operator*() const { return ngram_; }
57 NGramType* operator->() const { return &ngram_; }
58
59 Iterator& operator++() {
60 ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1));
61 ++head_;
62 CompleteNGramFrom(N - 1);
63 return *this;
64 }
65
66 Iterator operator++(int) {
67 Iterator copy(*this);
68 operator++();
69 return copy;
70 }
71
72 private:
Charlie Harrison03d146732018-09-13 20:37:0273 char ExtractHeadByte() {
74 char head_byte = *head_;
75 switch (CasePolicy) {
76 case NGramCaseExtraction::kCaseSensitive:
77 return head_byte;
78 case NGramCaseExtraction::kLowerCase:
79 return base::ToLowerASCII(head_byte);
80 }
81 }
pkalinnikove77a62e2016-06-24 10:21:4082 // Consumes characters starting with the one pointed to by |head_|, as many
83 // of them as needed to extend |ngram_| from its |current_length| to a
84 // length of N. Leaves |head_| pointing to the last character consumed.
85 void CompleteNGramFrom(size_t current_length) {
86 for (; head_ != end_; ++head_) {
87 if (extractor_.is_separator_(*head_)) {
88 current_length = 0;
89 ngram_ = 0;
90 } else {
Charlie Harrison03d146732018-09-13 20:37:0291 ngram_ = ngram_ << 8 | static_cast<NGramType>(ExtractHeadByte());
pkalinnikove77a62e2016-06-24 10:21:4092 if (++current_length == N)
93 break;
94 }
95 }
96 }
97
98 const NGramExtractor& extractor_;
99
100 // Always points to the last character included in the current |ngram_|.
101 base::StringPiece::const_iterator head_;
102 // Always points to extractor_.string_.end().
103 base::StringPiece::const_iterator end_;
104
105 // Contains the N-gram currently pointed to by the iterator. Undefined if
106 // the iterator is at the end.
107 NGramType ngram_ = 0;
108 };
109
110 // Constructs an extractor for iterating over N-grams contained in the
111 // |string|. |is_separator| is used to determine whether a certain character
112 // is a separator and should not be contained in an N-gram.
113 NGramExtractor(base::StringPiece string, IsSeparator is_separator)
114 : string_(string), is_separator_(is_separator) {}
115
116 Iterator begin() const { return Iterator(*this, string_.begin()); }
117 Iterator end() const { return Iterator(*this, string_.end()); }
118
119 private:
120 static_assert(std::is_integral<NGramType>::value, "Not an integral type.");
121 static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type.");
122 static_assert(N > 0u, "N should be positive.");
123 static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type.");
124
125 base::StringPiece string_;
126 IsSeparator is_separator_;
127};
128
// A helper function used to create an NGramExtractor for a |string| without
// knowing the direct type of the |is_separator| functor.
//
// Typical usage:
//   const char* str = "no*abacaba*abcd";
//   auto extractor =
//       CreateNGramExtractor<5, uint64_t, NGramCaseExtraction::kLowerCase>(
//           str, [](char c) { return c == '*'; });
//   for (uint64_t ngram : extractor) {
//     ... process the |ngram| ...
//   }
Charlie Harrison03d146732018-09-13 20:37:02140template <size_t N,
141 typename NGramType,
142 NGramCaseExtraction CasePolicy,
143 typename IsSeparator>
144NGramExtractor<N, NGramType, CasePolicy, IsSeparator> CreateNGramExtractor(
pkalinnikove77a62e2016-06-24 10:21:40145 base::StringPiece string,
146 IsSeparator is_separator) {
Charlie Harrison03d146732018-09-13 20:37:02147 return NGramExtractor<N, NGramType, CasePolicy, IsSeparator>(string,
148 is_separator);
pkalinnikove77a62e2016-06-24 10:21:40149}
150
Pavel Kalinnikovd7970632017-06-20 09:07:34151} // namespace url_pattern_index
pkalinnikove77a62e2016-06-24 10:21:40152
Pavel Kalinnikovd7970632017-06-20 09:07:34153#endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_