Added anonymization patterns for URLs and email addresses

BUG=567870

Review URL: https://codereview.chromium.org/1543633003

Cr-Commit-Position: refs/heads/master@{#368573}
diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc
index 713ceb6..57d1cd7 100644
--- a/components/feedback/anonymizer_tool.cc
+++ b/components/feedback/anonymizer_tool.cc
@@ -4,9 +4,11 @@
 
 #include "components/feedback/anonymizer_tool.h"
 
-#include <base/strings/string_number_conversions.h>
-#include <base/strings/string_util.h>
-#include <base/strings/stringprintf.h>
+#include <utility>
+
+#include "base/strings/string_number_conversions.h"
+#include "base/strings/string_util.h"
+#include "base/strings/stringprintf.h"
 
 #include "third_party/re2/src/re2/re2.h"
 
@@ -16,13 +18,16 @@
 
 namespace {
 
-// The |kCustomPatterns| array defines patterns to match and anonymize. Each
-// pattern needs to define three capturing parentheses groups:
+// The |kCustomPatternsWithContext| array defines patterns to match and
+// anonymize. Each pattern needs to define three capturing parentheses groups:
 //
 // - a group for the pattern before the identifier to be anonymized;
 // - a group for the identifier to be anonymized;
 // - a group for the pattern after the identifier to be anonymized.
 //
+// The first and the last capture group are the origin of the "WithContext"
+// suffix in the name of this constant.
+//
 // Every matched identifier (in the context of the whole pattern) is anonymized
 // by replacing it with an incremental instance identifier. Every different
 // pattern defines a separate instance identifier space. See the unit test for
@@ -35,7 +40,7 @@
 // (?i) turns on case insensitivy for the remainder of the regex.
 // (?-s) turns off "dot matches newline" for the remainder of the regex.
 // (?:regex) denotes non-capturing parentheses group.
-const char* kCustomPatterns[] = {
+const char* kCustomPatternsWithContext[] = {
     "(\\bCell ID: ')([0-9a-fA-F]+)(')",                  // ModemManager
     "(\\bLocation area code: ')([0-9a-fA-F]+)(')",       // ModemManager
     "(?i-s)(\\bssid[= ]')(.+)(')",                       // wpa_supplicant
@@ -43,10 +48,183 @@
     "(?-s)(\\[SSID=)(.+?)(\\])",                         // shill
 };
 
+// Helper macro: Non capturing group
+#define NCG(x) "(?:" x ")"
+// Helper macro: Optional non capturing group
+#define OPT_NCG(x) NCG(x) "?"
+
+//////////////////////////////////////////////////////////////////////////
+// Patterns for URLs, or better IRIs, based on RFC 3987 with an artificial
+// limitation on the scheme to increase precision. Otherwise anything
+// like "ID:" would be considered an IRI.
+
+#define UNRESERVED "[-a-z0-9._~]"
+#define RESERVED NCG(GEN_DELIMS "|" SUB_DELIMS)  // gen-delims / sub-delims
+#define SUB_DELIMS "[!$&'()*+,;=]"
+#define GEN_DELIMS "[:/?#[\\]@]"
+
+#define DIGIT "[0-9]"
+#define HEXDIG "[0-9a-f]"
+
+#define PCT_ENCODED "%" HEXDIG HEXDIG
+// Longest forms first; RE2 takes the first alternative that can match.
+#define DEC_OCTET NCG("1[0-9][0-9]|2[0-4][0-9]|25[0-5]|[1-9][0-9]|[0-9]")
+
+#define IPV4ADDRESS DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET "\\." DEC_OCTET
+
+#define H16 NCG(HEXDIG) "{1,4}"
+#define LS32 NCG(H16 ":" H16 "|" IPV4ADDRESS)
+
+#define IPV6ADDRESS NCG( \
+                                          NCG(H16 ":") "{6}" LS32 "|" \
+                                     "::" NCG(H16 ":") "{5}" LS32 "|" \
+  OPT_NCG(                      H16) "::" NCG(H16 ":") "{4}" LS32 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,1}" H16) "::" NCG(H16 ":") "{3}" LS32 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,2}" H16) "::" NCG(H16 ":") "{2}" LS32 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,3}" H16) "::" NCG(H16 ":")       LS32 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,4}" H16) "::"                    LS32 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,5}" H16) "::"                    H16 "|" \
+  OPT_NCG( NCG(H16 ":") "{0,6}" H16) "::")
+
+#define IPVFUTURE                     \
+  "v" HEXDIG                          \
+  "+"                                 \
+  "\\." NCG(UNRESERVED "|" SUB_DELIMS \
+                       "|"            \
+                       ":") "+"
+
+#define IP_LITERAL "\\[" NCG(IPV6ADDRESS "|" IPVFUTURE) "\\]"
+
+#define PORT DIGIT "*"
+
+// This is a diversion of RFC 3987
+#define SCHEME NCG("http|https|ftp|chrome|chrome-extension|android")
+
+#define IPRIVATE            \
+  "["                       \
+  "\\x{E000}-\\x{F8FF}"     \
+  "\\x{F0000}-\\x{FFFFD}"   \
+  "\\x{100000}-\\x{10FFFD}" \
+  "]"
+
+#define UCSCHAR \
+  "[" "\\x{A0}-\\x{D7FF}" "\\x{F900}-\\x{FDCF}" "\\x{FDF0}-\\x{FFEF}" \
+  "\\x{10000}-\\x{1FFFD}" "\\x{20000}-\\x{2FFFD}" "\\x{30000}-\\x{3FFFD}" \
+  "\\x{40000}-\\x{4FFFD}" "\\x{50000}-\\x{5FFFD}" "\\x{60000}-\\x{6FFFD}" \
+  "\\x{70000}-\\x{7FFFD}" "\\x{80000}-\\x{8FFFD}" "\\x{90000}-\\x{9FFFD}" \
+  "\\x{A0000}-\\x{AFFFD}" "\\x{B0000}-\\x{BFFFD}" "\\x{C0000}-\\x{CFFFD}" \
+  "\\x{D0000}-\\x{DFFFD}" "\\x{E1000}-\\x{EFFFD}" "]"
+
+#define IUNRESERVED NCG("[-a-z0-9._~]" "|" UCSCHAR)
+
+#define IPCHAR NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" "[:@]")
+#define IFRAGMENT NCG(IPCHAR "|" "[/?]") "*"
+#define IQUERY NCG(IPCHAR "|" IPRIVATE "|" "[/?]") "*"
+
+#define ISEGMENT IPCHAR "*"
+#define ISEGMENT_NZ IPCHAR "+"
+#define ISEGMENT_NZ_NC                           \
+  NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS \
+                  "|" "@") "+"
+
+#define IPATH_EMPTY ""
+#define IPATH_ROOTLESS ISEGMENT_NZ NCG("/" ISEGMENT) "*"
+#define IPATH_NOSCHEME ISEGMENT_NZ_NC NCG("/" ISEGMENT) "*"
+#define IPATH_ABSOLUTE "/" OPT_NCG(ISEGMENT_NZ NCG("/" ISEGMENT) "*")
+#define IPATH_ABEMPTY NCG("/" ISEGMENT) "*"
+
+#define IPATH NCG(IPATH_ABEMPTY "|" IPATH_ABSOLUTE "|" IPATH_NOSCHEME "|" \
+                  IPATH_ROOTLESS "|" IPATH_EMPTY)
+
+#define IREG_NAME NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS) "*"
+
+#define IHOST NCG(IP_LITERAL "|" IPV4ADDRESS "|" IREG_NAME)
+#define IUSERINFO NCG(IUNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|" ":") "*"
+#define IAUTHORITY OPT_NCG(IUSERINFO "@") IHOST OPT_NCG(":" PORT)
+
+#define IRELATIVE_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
+                           "|" IPATH_NOSCHEME "|" IPATH_EMPTY)
+#define IRELATIVE_REF \
+  IRELATIVE_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)  // literal '?'
+
+// RFC 3987 requires IPATH_EMPTY here but it is omitted so that statements
+// that end with "Android:" for example are not considered a URL.
+#define IHIER_PART NCG("//" IAUTHORITY IPATH_ABEMPTY "|" IPATH_ABSOLUTE \
+                       "|" IPATH_ROOTLESS)
+
+#define ABSOLUTE_IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY)  // literal '?'
+
+#define IRI SCHEME ":" IHIER_PART OPT_NCG("\\?" IQUERY) OPT_NCG("#" IFRAGMENT)
+
+#define IRI_REFERENCE NCG(IRI "|" IRELATIVE_REF)
+
+// TODO(battre): Use http://tools.ietf.org/html/rfc5322 to represent email
+// addresses. Capture names as well ("First Lastname" <foo@bar.com>).
+
+// The |kCustomPatternsWithoutContext| array defines further patterns to match
+// and anonymize. Each pattern consists of a single capturing group.
+const CustomPatternWithoutContext kCustomPatternsWithoutContext[] = {
+  {"URL", "(?i)(" IRI ")"},
+  // Email Addresses need to come after URLs because they can be part
+  // of a query parameter.
+  {"email", "(?i)([0-9a-z._%+-]+@[a-z0-9.-]+\\.[a-z]{2,6})"},
+  // IP filter rules need to come after URLs so that they don't disturb the
+  // URL pattern in case the IP address is part of a URL.
+  {"IPv4", "(?i)(" IPV4ADDRESS ")"},
+  {"IPv6", "(?i)(" IPV6ADDRESS ")"},
+};
+
+// Like RE2's FindAndConsume, searches for the first occurrence of |pattern| in
+// |input| and consumes the bytes until the end of the pattern matching. Unlike
+// FindAndConsume, the bytes skipped before the match of |pattern| are stored
+// in |skipped_input|. |args| needs to contain at least one element.
+// Returns whether a match was found.
+//
+// Example: input = "aaabbbc", pattern = "(b+)" leads to skipped_input = "aaa",
+// args[0] = "bbb", and the beginning input is moved to the right so that it
+// only contains "c".
+// Example: input = "aaabbbc", pattern = "(z+)" leads to input = "aaabbbc",
+// the args values are not modified and skipped_input is not modified.
+bool FindAndConsumeAndGetSkippedN(re2::StringPiece* input,
+                                  const re2::RE2& pattern,
+                                  re2::StringPiece* skipped_input,
+                                  re2::StringPiece* args[],
+                                  int argc) {
+  // Only one to three capture groups can be forwarded.
+  CHECK_GE(argc, 1);
+  CHECK_LE(argc, 3);
+
+  const re2::StringPiece unconsumed = *input;
+  re2::RE2::Arg group0(argc > 0 ? args[0] : nullptr);
+  re2::RE2::Arg group1(argc > 1 ? args[1] : nullptr);
+  re2::RE2::Arg group2(argc > 2 ? args[2] : nullptr);
+  const re2::RE2::Arg* const groups[] = {&group0, &group1, &group2};
+  const bool found = re2::RE2::FindAndConsumeN(input, pattern, groups, argc);
+  if (found && skipped_input) {
+    // Everything between the old start of |input| and the first group.
+    const size_t skipped_len = args[0]->data() - unconsumed.data();
+    *skipped_input = re2::StringPiece(unconsumed.data(), skipped_len);
+  }
+  return found;
+}
+
+// All |match_groups| need to be of type re2::StringPiece*.
+template <typename... Arg>
+bool FindAndConsumeAndGetSkipped(re2::StringPiece* input,
+                                 const re2::RE2& pattern,
+                                 re2::StringPiece* skipped_input,
+                                 Arg*... match_groups) {
+  re2::StringPiece* args[] = {match_groups...};
+  return FindAndConsumeAndGetSkippedN(input, pattern, skipped_input, args,
+                                      arraysize(args));
+}
+
 }  // namespace
 
 AnonymizerTool::AnonymizerTool()
-    : custom_patterns_(arraysize(kCustomPatterns)) {}
+    : custom_patterns_with_context_(arraysize(kCustomPatternsWithContext)),
+      custom_patterns_without_context_(
+          arraysize(kCustomPatternsWithoutContext)) {}
 
 AnonymizerTool::~AnonymizerTool() {}
 
@@ -56,48 +234,56 @@
   return anonymized;
 }
 
+RE2* AnonymizerTool::GetRegExp(const std::string& pattern) {
+  scoped_ptr<RE2>& cached = regexp_cache_[pattern];  // Null if not seen yet.
+  if (!cached) {
+    RE2::Options options;
+    // set_multiline of pcre is not supported by RE2, yet.
+    options.set_dot_nl(true);  // Dot matches a new line.
+    cached.reset(new RE2(pattern, options));
+    DCHECK_EQ(re2::RE2::NoError, cached->error_code())
+        << "Failed to parse:\n" << pattern << "\n" << cached->error();
+  }
+  return cached.get();
+}
+
 std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
   // This regular expression finds the next MAC address. It splits the data into
-  // a section preceding the MAC address, an OUI (Organizationally Unique
-  // Identifier) part and a NIC (Network Interface Controller) specific part.
+  // an OUI (Organizationally Unique Identifier) part and a NIC (Network
+  // Interface Controller) specific part.
 
-  RE2::Options options;
-  // set_multiline of pcre is not supported by RE2, yet.
-  options.set_dot_nl(true);  // Dot matches a new line.
-  RE2 mac_re(
-      "(.*?)("
-      "[0-9a-fA-F][0-9a-fA-F]:"
+  RE2* mac_re = GetRegExp(
+      "([0-9a-fA-F][0-9a-fA-F]:"
       "[0-9a-fA-F][0-9a-fA-F]:"
       "[0-9a-fA-F][0-9a-fA-F]):("
       "[0-9a-fA-F][0-9a-fA-F]:"
       "[0-9a-fA-F][0-9a-fA-F]:"
-      "[0-9a-fA-F][0-9a-fA-F])",
-      options);
+      "[0-9a-fA-F][0-9a-fA-F])");
 
   std::string result;
   result.reserve(input.size());
 
   // Keep consuming, building up a result string as we go.
   re2::StringPiece text(input);
-  std::string pre_mac, oui, nic;
-  while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),
-                           RE2::Arg(&nic))) {
+  re2::StringPiece skipped;
+  re2::StringPiece pre_mac, oui, nic;
+  while (FindAndConsumeAndGetSkipped(&text, *mac_re, &skipped, &oui, &nic)) {
     // Look up the MAC address in the hash.
-    oui = base::ToLowerASCII(oui);
-    nic = base::ToLowerASCII(nic);
-    std::string mac = oui + ":" + nic;
+    std::string oui_string = base::ToLowerASCII(oui.as_string());
+    std::string nic_string = base::ToLowerASCII(nic.as_string());
+    std::string mac = oui_string + ":" + nic_string;
     std::string replacement_mac = mac_addresses_[mac];
     if (replacement_mac.empty()) {
       // If not found, build up a replacement MAC address by generating a new
       // NIC part.
       int mac_id = mac_addresses_.size();
       replacement_mac = base::StringPrintf(
-          "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,
+          "%s:%02x:%02x:%02x", oui_string.c_str(), (mac_id & 0x00ff0000) >> 16,
           (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));
       mac_addresses_[mac] = replacement_mac;
     }
 
-    result += pre_mac;
+    skipped.AppendToString(&result);
     result += replacement_mac;
   }
 
@@ -106,43 +292,79 @@
 }
 
 std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
-  for (size_t i = 0; i < arraysize(kCustomPatterns); i++) {
+  for (size_t i = 0; i < arraysize(kCustomPatternsWithContext); i++) {
     input =
-        AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]);
+        AnonymizeCustomPatternWithContext(input, kCustomPatternsWithContext[i],
+                                          &custom_patterns_with_context_[i]);
+  }
+  for (size_t i = 0; i < arraysize(kCustomPatternsWithoutContext); i++) {
+    input = AnonymizeCustomPatternWithoutContext(
+        input, kCustomPatternsWithoutContext[i],
+        &custom_patterns_without_context_[i]);
   }
   return input;
 }
 
-// static
-std::string AnonymizerTool::AnonymizeCustomPattern(
+std::string AnonymizerTool::AnonymizeCustomPatternWithContext(
     const std::string& input,
     const std::string& pattern,
     std::map<std::string, std::string>* identifier_space) {
-  RE2::Options options;
-  // set_multiline of pcre is not supported by RE2, yet.
-  options.set_dot_nl(true);  // Dot matches a new line.
-  RE2 re("(.*?)" + pattern, options);
-  DCHECK_EQ(4, re.NumberOfCapturingGroups());
+  RE2* re = GetRegExp(pattern);
+  DCHECK_EQ(3, re->NumberOfCapturingGroups());
 
   std::string result;
   result.reserve(input.size());
 
   // Keep consuming, building up a result string as we go.
   re2::StringPiece text(input);
-  std::string pre_match, pre_matched_id, matched_id, post_matched_id;
-  while (RE2::Consume(&text, re, RE2::Arg(&pre_match),
-                      RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),
-                      RE2::Arg(&post_matched_id))) {
-    std::string replacement_id = (*identifier_space)[matched_id];
+  re2::StringPiece skipped;
+  re2::StringPiece pre_match, pre_matched_id, matched_id, post_matched_id;
+  while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &pre_matched_id,
+                                     &matched_id, &post_matched_id)) {
+    std::string matched_id_as_string = matched_id.as_string();
+    std::string replacement_id = (*identifier_space)[matched_id_as_string];
     if (replacement_id.empty()) {
       replacement_id = base::IntToString(identifier_space->size());
-      (*identifier_space)[matched_id] = replacement_id;
+      (*identifier_space)[matched_id_as_string] = replacement_id;
     }
 
-    result += pre_match;
-    result += pre_matched_id;
+    skipped.AppendToString(&result);
+    pre_matched_id.AppendToString(&result);
     result += replacement_id;
-    result += post_matched_id;
+    post_matched_id.AppendToString(&result);
+  }
+  text.AppendToString(&result);
+  return result;
+}
+
+std::string AnonymizerTool::AnonymizeCustomPatternWithoutContext(
+    const std::string& input,
+    const CustomPatternWithoutContext& pattern,
+    std::map<std::string, std::string>* identifier_space) {
+  RE2* re = GetRegExp(pattern.pattern);
+  DCHECK_EQ(1, re->NumberOfCapturingGroups());
+
+  std::string result;
+  result.reserve(input.size());
+
+  // Keep consuming, building up a result string as we go.
+  re2::StringPiece text(input);
+  re2::StringPiece skipped;
+  re2::StringPiece matched_id;
+  while (FindAndConsumeAndGetSkipped(&text, *re, &skipped, &matched_id)) {
+    std::string matched_id_as_string = matched_id.as_string();
+    std::string replacement_id = (*identifier_space)[matched_id_as_string];
+    if (replacement_id.empty()) {
+      // The weird Uint64toString trick is because Windows does not like to deal
+      // with %zu and a size_t in printf, nor does it support %llu.
+      replacement_id = base::StringPrintf(
+          "<%s: %s>", pattern.alias,
+          base::Uint64ToString(identifier_space->size()).c_str());
+      (*identifier_space)[matched_id_as_string] = replacement_id;
+    }
+
+    skipped.AppendToString(&result);
+    result += replacement_id;
   }
   text.AppendToString(&result);
   return result;
diff --git a/components/feedback/anonymizer_tool.h b/components/feedback/anonymizer_tool.h
index 54a690f..d41b13b 100644
--- a/components/feedback/anonymizer_tool.h
+++ b/components/feedback/anonymizer_tool.h
@@ -9,10 +9,24 @@
 #include <string>
 #include <vector>
 
-#include <base/macros.h>
+#include "base/macros.h"
+#include "base/memory/scoped_ptr.h"
+
+namespace re2 {
+class RE2;
+}
 
 namespace feedback {
 
+struct CustomPatternWithoutContext {
+  // A string literal used in anonymized tests. Matches to the |pattern| are
+  // replaced with <|alias|: 1>, <|alias|: 2>, ...
+  const char* alias;
+  // A RE2 regexp with exactly one capture group. Matches will be replaced by
+  // the alias reference described above.
+  const char* pattern;
+};
+
 class AnonymizerTool {
  public:
   AnonymizerTool();
@@ -25,12 +39,18 @@
  private:
   friend class AnonymizerToolTest;
 
+  re2::RE2* GetRegExp(const std::string& pattern);
+
   std::string AnonymizeMACAddresses(const std::string& input);
   std::string AnonymizeCustomPatterns(std::string input);
-  static std::string AnonymizeCustomPattern(
+  std::string AnonymizeCustomPatternWithContext(
       const std::string& input,
       const std::string& pattern,
       std::map<std::string, std::string>* identifier_space);
+  std::string AnonymizeCustomPatternWithoutContext(
+      const std::string& input,
+      const CustomPatternWithoutContext& pattern,
+      std::map<std::string, std::string>* identifier_space);
 
   // Map of MAC addresses discovered in anonymized strings to anonymized
   // representations. 11:22:33:44:55:66 gets anonymized to 11:22:33:00:00:01,
@@ -40,9 +60,15 @@
   std::map<std::string, std::string> mac_addresses_;
 
   // Like mac addresses, identifiers in custom patterns are anonymized.
-  // custom_patterns_[i] contains a map of original identifier to anonymized
-  // identifier for custom pattern number i.
-  std::vector<std::map<std::string, std::string>> custom_patterns_;
+  // custom_patterns_with_context_[i] contains a map of original identifier to
+  // anonymized identifier for custom pattern number i.
+  std::vector<std::map<std::string, std::string>> custom_patterns_with_context_;
+  std::vector<std::map<std::string, std::string>>
+      custom_patterns_without_context_;
+
+  // Cache to prevent the repeated compilation of the same regular expression
+  // pattern. Key is the string representation of the RegEx.
+  std::map<std::string, scoped_ptr<re2::RE2>> regexp_cache_;
 
   DISALLOW_COPY_AND_ASSIGN(AnonymizerTool);
 };
diff --git a/components/feedback/anonymizer_tool_unittest.cc b/components/feedback/anonymizer_tool_unittest.cc
index 68f35a8..2e89137 100644
--- a/components/feedback/anonymizer_tool_unittest.cc
+++ b/components/feedback/anonymizer_tool_unittest.cc
@@ -6,6 +6,8 @@
 
 #include <gtest/gtest.h>
 
+#include "base/strings/string_util.h"
+
 namespace feedback {
 
 class AnonymizerToolTest : public testing::Test {
@@ -18,11 +20,19 @@
     return anonymizer_.AnonymizeCustomPatterns(input);
   }
 
-  static std::string AnonymizeCustomPattern(
+  std::string AnonymizeCustomPatternWithContext(
       const std::string& input,
       const std::string& pattern,
       std::map<std::string, std::string>* space) {
-    return AnonymizerTool::AnonymizeCustomPattern(input, pattern, space);
+    return anonymizer_.AnonymizeCustomPatternWithContext(input, pattern, space);
+  }
+
+  std::string AnonymizeCustomPatternWithoutContext(
+      const std::string& input,
+      const CustomPatternWithoutContext& pattern,
+      std::map<std::string, std::string>* space) {
+    return anonymizer_.AnonymizeCustomPatternWithoutContext(input, pattern,
+                                                            space);
   }
 
   AnonymizerTool anonymizer_;
@@ -84,26 +94,86 @@
   EXPECT_EQ(
       "a\nb [SSID=1] [SSID=2] [SSID=foo\nbar] b",
       AnonymizeCustomPatterns("a\nb [SSID=foo] [SSID=bar] [SSID=foo\nbar] b"));
+
+  EXPECT_EQ("<email: 1>",
+            AnonymizeCustomPatterns("foo@bar.com"));
+  EXPECT_EQ("Email: <email: 1>.",
+            AnonymizeCustomPatterns("Email: foo@bar.com."));
+  EXPECT_EQ("Email:\n<email: 2>\n",
+            AnonymizeCustomPatterns("Email:\nfooooo@bar.com\n"));
+
+  EXPECT_EQ("[<IPv6: 1>]", AnonymizeCustomPatterns(
+                               "[2001:0db8:0000:0000:0000:ff00:0042:8329]"));
+  EXPECT_EQ("[<IPv6: 2>]",
+            AnonymizeCustomPatterns("[2001:db8:0:0:0:ff00:42:8329]"));
+  EXPECT_EQ("[<IPv6: 3>]", AnonymizeCustomPatterns("[2001:db8::ff00:42:8329]"));
+  EXPECT_EQ("[<IPv6: 4>]", AnonymizeCustomPatterns("[::1]"));
+  EXPECT_EQ("<IPv4: 1>", AnonymizeCustomPatterns("192.168.0.1"));
+
+  EXPECT_EQ("<URL: 1>",
+            AnonymizeCustomPatterns("http://example.com/foo?test=1"));
+  EXPECT_EQ("Foo <URL: 2> Bar",
+            AnonymizeCustomPatterns("Foo http://192.168.0.1/foo?test=1#123 Bar"));
+  const char* kURLs[] = {
+    "http://example.com/foo?test=1",
+    "http://userid:password@example.com:8080",
+    "http://userid:password@example.com:8080/",
+    "http://@example.com",
+    "http://192.168.0.1",
+    "http://192.168.0.1/",
+    "http://اختبار.com",
+    "http://test.com/foo(bar)baz.html",
+    "http://test.com/foo%20bar",
+    "ftp://test:tester@test.com",
+    "chrome://extensions/",
+    "chrome-extension://aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/options.html",
+    "http://example.com/foo?email=foo@bar.com",
+  };
+  for (size_t i = 0; i < arraysize(kURLs); ++i) {
+    SCOPED_TRACE(kURLs[i]);
+    std::string got = AnonymizeCustomPatterns(kURLs[i]);
+    EXPECT_TRUE(
+        base::StartsWith(got, "<URL: ", base::CompareCase::INSENSITIVE_ASCII));
+    EXPECT_TRUE(base::EndsWith(got, ">", base::CompareCase::INSENSITIVE_ASCII));
+  }
+  // Test that "Android:" is not considered a schema with empty hier part.
+  EXPECT_EQ("The following applies to Android:",
+            AnonymizeCustomPatterns("The following applies to Android:"));
 }
 
-TEST_F(AnonymizerToolTest, AnonymizeCustomPattern) {
+TEST_F(AnonymizerToolTest, AnonymizeCustomPatternWithContext) {
   const char kPattern[] = "(\\b(?i)id:? ')(\\d+)(')";
   std::map<std::string, std::string> space;
-  EXPECT_EQ("", AnonymizeCustomPattern("", kPattern, &space));
+  EXPECT_EQ("", AnonymizeCustomPatternWithContext("", kPattern, &space));
   EXPECT_EQ("foo\nbar\n",
-            AnonymizeCustomPattern("foo\nbar\n", kPattern, &space));
-  EXPECT_EQ("id '1'", AnonymizeCustomPattern("id '2345'", kPattern, &space));
-  EXPECT_EQ("id '2'", AnonymizeCustomPattern("id '1234'", kPattern, &space));
-  EXPECT_EQ("id: '2'", AnonymizeCustomPattern("id: '1234'", kPattern, &space));
-  EXPECT_EQ("ID: '1'", AnonymizeCustomPattern("ID: '2345'", kPattern, &space));
+            AnonymizeCustomPatternWithContext("foo\nbar\n", kPattern, &space));
+  EXPECT_EQ("id '1'",
+            AnonymizeCustomPatternWithContext("id '2345'", kPattern, &space));
+  EXPECT_EQ("id '2'",
+            AnonymizeCustomPatternWithContext("id '1234'", kPattern, &space));
+  EXPECT_EQ("id: '2'",
+            AnonymizeCustomPatternWithContext("id: '1234'", kPattern, &space));
+  EXPECT_EQ("ID: '1'",
+            AnonymizeCustomPatternWithContext("ID: '2345'", kPattern, &space));
   EXPECT_EQ("x1 id '1' 1x id '2'\nid '1'\n",
-            AnonymizeCustomPattern("x1 id '2345' 1x id '1234'\nid '2345'\n",
-                                   kPattern, &space));
+            AnonymizeCustomPatternWithContext(
+                "x1 id '2345' 1x id '1234'\nid '2345'\n", kPattern, &space));
   space.clear();
-  EXPECT_EQ("id '1'", AnonymizeCustomPattern("id '1234'", kPattern, &space));
+  EXPECT_EQ("id '1'",
+            AnonymizeCustomPatternWithContext("id '1234'", kPattern, &space));
 
   space.clear();
-  EXPECT_EQ("x1z", AnonymizeCustomPattern("xyz", "()(y+)()", &space));
+  EXPECT_EQ("x1z",
+            AnonymizeCustomPatternWithContext("xyz", "()(y+)()", &space));
+}
+
+TEST_F(AnonymizerToolTest, AnonymizeCustomPatternWithoutContext) {
+  CustomPatternWithoutContext kPattern = {"pattern", "(o+)"};
+  std::map<std::string, std::string> space;
+  EXPECT_EQ("", AnonymizeCustomPatternWithoutContext("", kPattern, &space));
+  EXPECT_EQ("f<pattern: 1>\nf<pattern: 2>z\nf<pattern: 1>l\n",
+            AnonymizeCustomPatternWithoutContext("fo\nfooz\nfol\n", kPattern,
+                                                 &space));
 }
 
 }  // namespace feedback