pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 1 | // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 5 | #ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |
| 6 | #define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 7 | |
| 8 | #include <stddef.h> |
| 9 | |
| 10 | #include <iterator> |
| 11 | #include <type_traits> |
| 12 | |
| 13 | #include "base/logging.h" |
| 14 | #include "base/strings/string_piece.h" |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 15 | #include "base/strings/string_util.h" |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 16 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 17 | namespace url_pattern_index { |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 18 | |
| 19 | // The class used to iteratively extract N-grams from strings. An N-gram is a |
| 20 | // string consisting of N (up to 8) non-special characters, which are stored in |
| 21 | // the lowest N non-zero bytes, lower bytes corresponding to later symbols. The |
| 22 | // size of the integer type limits the maximum value of N. For example an |
| 23 | // uint64_t can store up to 8-grams. |
| 24 | // |
| 25 | // Note: If used for UTF-8 strings, the N-grams can have partial byte sequences. |
| 26 | // |
| 27 | // Template parameters: |
| 28 | // * N - the size of N-grams. |
| 29 | // * NGramType - the integer type used to encode N-grams. |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 30 | // * CasePolicy - whether or not to lower-case the N-grams. Assumes ASCII. |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 31 | // * IsSeparator - the type of a bool(char) functor. |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 32 | enum class NGramCaseExtraction { kCaseSensitive, kLowerCase }; |
| 33 | template <size_t N, |
| 34 | typename NGramType, |
| 35 | NGramCaseExtraction CasePolicy, |
| 36 | typename IsSeparator> |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 37 | class NGramExtractor { |
| 38 | public: |
| 39 | // An STL compatible input iterator over N-grams contained in a string. |
| 40 | class Iterator : public std::iterator<std::input_iterator_tag, NGramType> { |
| 41 | public: |
| 42 | // Creates an iterator, which points to the leftmost valid N-gram within the |
| 43 | // |extractor|'s string, starting from |head|. |
| 44 | Iterator(const NGramExtractor& extractor, |
| 45 | base::StringPiece::const_iterator head) |
| 46 | : extractor_(extractor), head_(head), end_(extractor.string_.end()) { |
| 47 | DCHECK_GE(head, extractor_.string_.begin()); |
| 48 | DCHECK_LE(head, end_); |
| 49 | |
| 50 | CompleteNGramFrom(0); |
| 51 | } |
| 52 | |
| 53 | bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; } |
| 54 | bool operator!=(const Iterator& rhs) const { return !operator==(rhs); } |
| 55 | |
| 56 | NGramType operator*() const { return ngram_; } |
| 57 | NGramType* operator->() const { return &ngram_; } |
| 58 | |
| 59 | Iterator& operator++() { |
| 60 | ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1)); |
| 61 | ++head_; |
| 62 | CompleteNGramFrom(N - 1); |
| 63 | return *this; |
| 64 | } |
| 65 | |
| 66 | Iterator operator++(int) { |
| 67 | Iterator copy(*this); |
| 68 | operator++(); |
| 69 | return copy; |
| 70 | } |
| 71 | |
| 72 | private: |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 73 | char ExtractHeadByte() { |
| 74 | char head_byte = *head_; |
| 75 | switch (CasePolicy) { |
| 76 | case NGramCaseExtraction::kCaseSensitive: |
| 77 | return head_byte; |
| 78 | case NGramCaseExtraction::kLowerCase: |
| 79 | return base::ToLowerASCII(head_byte); |
| 80 | } |
| 81 | } |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 82 | // Consumes characters starting with the one pointed to by |head_|, as many |
| 83 | // of them as needed to extend |ngram_| from its |current_length| to a |
| 84 | // length of N. Leaves |head_| pointing to the last character consumed. |
| 85 | void CompleteNGramFrom(size_t current_length) { |
| 86 | for (; head_ != end_; ++head_) { |
| 87 | if (extractor_.is_separator_(*head_)) { |
| 88 | current_length = 0; |
| 89 | ngram_ = 0; |
| 90 | } else { |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 91 | ngram_ = ngram_ << 8 | static_cast<NGramType>(ExtractHeadByte()); |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 92 | if (++current_length == N) |
| 93 | break; |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | const NGramExtractor& extractor_; |
| 99 | |
| 100 | // Always points to the last character included in the current |ngram_|. |
| 101 | base::StringPiece::const_iterator head_; |
| 102 | // Always points to extractor_.string_.end(). |
| 103 | base::StringPiece::const_iterator end_; |
| 104 | |
| 105 | // Contains the N-gram currently pointed to by the iterator. Undefined if |
| 106 | // the iterator is at the end. |
| 107 | NGramType ngram_ = 0; |
| 108 | }; |
| 109 | |
| 110 | // Constructs an extractor for iterating over N-grams contained in the |
| 111 | // |string|. |is_separator| is used to determine whether a certain character |
| 112 | // is a separator and should not be contained in an N-gram. |
| 113 | NGramExtractor(base::StringPiece string, IsSeparator is_separator) |
| 114 | : string_(string), is_separator_(is_separator) {} |
| 115 | |
| 116 | Iterator begin() const { return Iterator(*this, string_.begin()); } |
| 117 | Iterator end() const { return Iterator(*this, string_.end()); } |
| 118 | |
| 119 | private: |
| 120 | static_assert(std::is_integral<NGramType>::value, "Not an integral type."); |
| 121 | static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type."); |
| 122 | static_assert(N > 0u, "N should be positive."); |
| 123 | static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type."); |
| 124 | |
| 125 | base::StringPiece string_; |
| 126 | IsSeparator is_separator_; |
| 127 | }; |
| 128 | |
| 129 | // A helper function used to create an NGramExtractor for a |string| without |
| 130 | // knowing the direct type of the |is_separator| functor. |
| 131 | // |
| 132 | // Typical usage: |
| 133 | // const char* str = "no*abacaba*abcd"; |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 134 | // auto extractor = |
| 135 | // CreateNGramExtractor<5, uint64_t, NGrameCaseExtraction::kLowercase>( |
| 136 | // str, [](char c) { return c == '*'; }); |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 137 | // for (uint64_t ngram : extractor) { |
| 138 | // ... process the |ngram| ... |
| 139 | // } |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 140 | template <size_t N, |
| 141 | typename NGramType, |
| 142 | NGramCaseExtraction CasePolicy, |
| 143 | typename IsSeparator> |
| 144 | NGramExtractor<N, NGramType, CasePolicy, IsSeparator> CreateNGramExtractor( |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 145 | base::StringPiece string, |
| 146 | IsSeparator is_separator) { |
Charlie Harrison | 03d14673 | 2018-09-13 20:37:02 | [diff] [blame] | 147 | return NGramExtractor<N, NGramType, CasePolicy, IsSeparator>(string, |
| 148 | is_separator); |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 149 | } |
| 150 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 151 | } // namespace url_pattern_index |
pkalinnikov | e77a62e | 2016-06-24 10:21:40 | [diff] [blame] | 152 | |
Pavel Kalinnikov | d797063 | 2017-06-20 09:07:34 | [diff] [blame] | 153 | #endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ |