Blame - components/url_pattern_index/ngram_extractor.h - chromium/src

blob: 4c13198ec2e4241bb9e124f9a4a6ab0dcaa8c1c7 [file] [log] [blame]

pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	1	// Copyright 2016 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
Pavel Kalinnikov	d797063	2017-06-20 09:07:34	[diff] [blame]	5	#ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
				6	#define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	7
				8	#include <stddef.h>
				9
				10	#include <iterator>
				11	#include <type_traits>
				12
				13	#include "base/logging.h"
				14	#include "base/strings/string_piece.h"
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	15	#include "base/strings/string_util.h"
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	16
Pavel Kalinnikov	d797063	2017-06-20 09:07:34	[diff] [blame]	17	namespace url_pattern_index {
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	18
				19	// The class used to iteratively extract N-grams from strings. An N-gram is a
				20	// string consisting of N (up to 8) non-special characters, which are stored in
				21	// the lowest N non-zero bytes, lower bytes corresponding to later symbols. The
				22	// size of the integer type limits the maximum value of N. For example an
				23	// uint64_t can store up to 8-grams.
				24	//
				25	// Note: If used for UTF-8 strings, the N-grams can have partial byte sequences.
				26	//
				27	// Template parameters:
				28	// * N - the size of N-grams.
				29	// * NGramType - the integer type used to encode N-grams.
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	30	// * CasePolicy - whether or not to lower-case the N-grams. Assumes ASCII.
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	31	// * IsSeparator - the type of a bool(char) functor.
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	32	enum class NGramCaseExtraction { kCaseSensitive, kLowerCase };
				33	template <size_t N,
				34	typename NGramType,
				35	NGramCaseExtraction CasePolicy,
				36	typename IsSeparator>
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	37	class NGramExtractor {
				38	public:
				39	// An STL compatible input iterator over N-grams contained in a string.
				40	class Iterator : public std::iterator<std::input_iterator_tag, NGramType> {
				41	public:
				42	// Creates an iterator, which points to the leftmost valid N-gram within the
				43	// \|extractor\|'s string, starting from \|head\|.
				44	Iterator(const NGramExtractor& extractor,
				45	base::StringPiece::const_iterator head)
				46	: extractor_(extractor), head_(head), end_(extractor.string_.end()) {
				47	DCHECK_GE(head, extractor_.string_.begin());
				48	DCHECK_LE(head, end_);
				49
				50	CompleteNGramFrom(0);
				51	}
				52
				53	bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; }
				54	bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
				55
				56	NGramType operator*() const { return ngram_; }
				57	NGramType* operator->() const { return &ngram_; }
				58
				59	Iterator& operator++() {
				60	ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1));
				61	++head_;
				62	CompleteNGramFrom(N - 1);
				63	return *this;
				64	}
				65
				66	Iterator operator++(int) {
				67	Iterator copy(*this);
				68	operator++();
				69	return copy;
				70	}
				71
				72	private:
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	73	char ExtractHeadByte() {
				74	char head_byte = *head_;
				75	switch (CasePolicy) {
				76	case NGramCaseExtraction::kCaseSensitive:
				77	return head_byte;
				78	case NGramCaseExtraction::kLowerCase:
				79	return base::ToLowerASCII(head_byte);
				80	}
				81	}
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	82	// Consumes characters starting with the one pointed to by \|head_\|, as many
				83	// of them as needed to extend \|ngram_\| from its \|current_length\| to a
				84	// length of N. Leaves \|head_\| pointing to the last character consumed.
				85	void CompleteNGramFrom(size_t current_length) {
				86	for (; head_ != end_; ++head_) {
				87	if (extractor_.is_separator_(*head_)) {
				88	current_length = 0;
				89	ngram_ = 0;
				90	} else {
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	91	ngram_ = ngram_ << 8 \| static_cast<NGramType>(ExtractHeadByte());
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	92	if (++current_length == N)
				93	break;
				94	}
				95	}
				96	}
				97
				98	const NGramExtractor& extractor_;
				99
				100	// Always points to the last character included in the current \|ngram_\|.
				101	base::StringPiece::const_iterator head_;
				102	// Always points to extractor_.string_.end().
				103	base::StringPiece::const_iterator end_;
				104
				105	// Contains the N-gram currently pointed to by the iterator. Undefined if
				106	// the iterator is at the end.
				107	NGramType ngram_ = 0;
				108	};
				109
				110	// Constructs an extractor for iterating over N-grams contained in the
				111	// \|string\|. \|is_separator\| is used to determine whether a certain character
				112	// is a separator and should not be contained in an N-gram.
				113	NGramExtractor(base::StringPiece string, IsSeparator is_separator)
				114	: string_(string), is_separator_(is_separator) {}
				115
				116	Iterator begin() const { return Iterator(*this, string_.begin()); }
				117	Iterator end() const { return Iterator(*this, string_.end()); }
				118
				119	private:
				120	static_assert(std::is_integral<NGramType>::value, "Not an integral type.");
				121	static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type.");
				122	static_assert(N > 0u, "N should be positive.");
				123	static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type.");
				124
				125	base::StringPiece string_;
				126	IsSeparator is_separator_;
				127	};
				128
				129	// A helper function used to create an NGramExtractor for a \|string\| without
				130	// knowing the direct type of the \|is_separator\| functor.
				131	//
				132	// Typical usage:
				133	// const char* str = "noabacabaabcd";
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	134	// auto extractor =
				135	// CreateNGramExtractor<5, uint64_t, NGrameCaseExtraction::kLowercase>(
				136	// str, [](char c) { return c == '*'; });
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	137	// for (uint64_t ngram : extractor) {
				138	// ... process the \|ngram\| ...
				139	// }
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	140	template <size_t N,
				141	typename NGramType,
				142	NGramCaseExtraction CasePolicy,
				143	typename IsSeparator>
				144	NGramExtractor<N, NGramType, CasePolicy, IsSeparator> CreateNGramExtractor(
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	145	base::StringPiece string,
				146	IsSeparator is_separator) {
Charlie Harrison	03d14673	2018-09-13 20:37:02	[diff] [blame]	147	return NGramExtractor<N, NGramType, CasePolicy, IsSeparator>(string,
				148	is_separator);
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	149	}
				150
Pavel Kalinnikov	d797063	2017-06-20 09:07:34	[diff] [blame]	151	} // namespace url_pattern_index
pkalinnikov	e77a62e	2016-06-24 10:21:40	[diff] [blame]	152
Pavel Kalinnikov	d797063	2017-06-20 09:07:34	[diff] [blame]	153	#endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_