anonymizer_tool: IPv6 sanitization improvements.

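This patch whitelists certain special-purpose IPv6 subnets and prevents
spurious matches (e.g. within "net::ERR_CONN_TIMEOUT") from being treated
as scrubbable IPv6 addresses, as per go/cros_ip_logsanitizer.
IPv4-mapped (::ffff:0:0/96) and IPv4-translated (64:ff9b::/96) IPv6
addresses are matched against the whitelist by their embedded IPv4
address; whitelisted results are prefixed with "M " or "T " respectively.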

BUG=b:111048642
TEST=components_unittests

BUG: 908117
Change-Id: If8ee24aba1ca5ac9a077dcb1cfa88ac9a69807f5
Reviewed-on: https://chromium-review.googlesource.com/c/1424626
Commit-Queue: Garrick Evans <[email protected]>
Reviewed-by: Dominic Battré <[email protected]>
Reviewed-by: Thiemo Nagel <[email protected]>
Auto-Submit: Garrick Evans <[email protected]>
Cr-Commit-Position: refs/heads/master@{#626507}
diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc
index 39ea9cdd..7fbf845 100644
--- a/components/feedback/anonymizer_tool.cc
+++ b/components/feedback/anonymizer_tool.cc
@@ -7,7 +7,7 @@
 #include <memory>
 #include <utility>
 
-#include "base/stl_util.h"
+#include "base/strings/strcat.h"
 #include "base/strings/string_number_conversions.h"
 #include "base/strings/string_util.h"
 #include "base/strings/stringprintf.h"
@@ -59,13 +59,46 @@
     "(?i-s)(serial\\s*(?:number)?\\s*[:=]\\s*)([0-9a-zA-Z\\-\"]+)()",
 };
 
-// Returns the number of leading bytes that may be kept unsanitized.
-std::string MaybeScrubIPv4Address(const std::string& addr) {
+bool MaybeUnmapAddress(net::IPAddress* addr) {
+  if (!addr->IsIPv4MappedIPv6())
+    return false;
+
+  *addr = net::ConvertIPv4MappedIPv6ToIPv4(*addr);
+  return true;
+}
+
+bool MaybeUntranslateAddress(net::IPAddress* addr) {
+  if (!addr->IsIPv6())
+    return false;
+
+  static const net::IPAddress kTranslated6To4(0, 0x64, 0xff, 0x9b, 0, 0, 0, 0,
+                                              0, 0, 0, 0, 0, 0, 0, 0);
+  if (!IPAddressMatchesPrefix(*addr, kTranslated6To4, 96))
+    return false;
+
+  const auto bytes = addr->bytes();
+  *addr = net::IPAddress(bytes[12], bytes[13], bytes[14], bytes[15]);
+  return true;
+}
+
+// Zeroes all but the /32 prefix of an IPv6 |addr|; returns false if not IPv6.
+bool MaybeTruncateIPv6(net::IPAddress* addr) {
+  if (!addr->IsIPv6())
+    return false;
+
+  const auto bytes = addr->bytes();
+  *addr = net::IPAddress(bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0);
+  return true;
+}
+
+// Returns an appropriately scrubbed version of |addr| if applicable.
+std::string MaybeScrubIPAddress(const std::string& addr) {
   struct {
     net::IPAddress ip_addr;
     int prefix_length;
     bool scrub;
-  } static const kWhitelistedIPv4Ranges[] = {
+  } static const kWhitelistedIPRanges[] = {
       // Private.
       {net::IPAddress(10, 0, 0, 0), 8, true},
       {net::IPAddress(172, 16, 0, 0), 12, true},
@@ -84,20 +117,57 @@
       {net::IPAddress(224, 0, 0, 0), 4, true},
       // Link local.
       {net::IPAddress(169, 254, 0, 0), 16, true},
+      {net::IPAddress(0xfe, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 10,
+       true},
       // Broadcast.
       {net::IPAddress(255, 255, 255, 255), 32, false},
+      // IPv6 loopback, unspecified and non-address strings.
+      {net::IPAddress::IPv6AllZeros(), 112, false},
+      // IPv6 multicast all nodes and routers.
+      {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
+       128, false},
+      {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
+       128, false},
+      {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
+       128, false},
+      {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2),
+       128, false},
+      // IPv6 other multicast (link and interface local).
+      {net::IPAddress(0xff, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16,
+       true},
+      {net::IPAddress(0xff, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 16,
+       true},
+
   };
   net::IPAddress input_addr;
-  if (input_addr.AssignFromIPLiteral(addr) && input_addr.IsIPv4()) {
-    for (const auto& range : kWhitelistedIPv4Ranges) {
+  if (input_addr.AssignFromIPLiteral(addr) && input_addr.IsValid()) {
+    bool mapped = MaybeUnmapAddress(&input_addr);
+    bool translated = !mapped ? MaybeUntranslateAddress(&input_addr) : false;
+    for (const auto& range : kWhitelistedIPRanges) {
       if (IPAddressMatchesPrefix(input_addr, range.ip_addr,
                                  range.prefix_length)) {
-        return range.scrub ? base::StringPrintf(
-                                 "%s/%d", range.ip_addr.ToString().c_str(),
-                                 range.prefix_length)
-                           : addr;
+        std::string prefix;
+        std::string out_addr = addr;
+        if (mapped) {
+          prefix = "M ";
+          out_addr = input_addr.ToString();
+        } else if (translated) {
+          prefix = "T ";
+          out_addr = input_addr.ToString();
+        }
+        if (range.scrub) {
+          out_addr = base::StringPrintf(
+              "%s/%d", range.ip_addr.ToString().c_str(), range.prefix_length);
+        }
+        return base::StrCat({prefix, out_addr});
       }
     }
+    // |addr| may have been over-aggressively matched as an IPv6 address when
+    // it's really just an arbitrary part of a sentence. If the string is the
+    // same as the coarsely truncated address then keep it because even if
+    // it happens to be a real address, there is no loss of anonymity.
+    if (MaybeTruncateIPv6(&input_addr) && input_addr.ToString() == addr)
+      return addr;
   }
   return "";
 }
@@ -421,7 +491,7 @@
     std::string matched_id_as_string = matched_id.as_string();
     std::string replacement_id = (*identifier_space)[matched_id_as_string];
     if (replacement_id.empty()) {
-      replacement_id = MaybeScrubIPv4Address(matched_id_as_string);
+      replacement_id = MaybeScrubIPAddress(matched_id_as_string);
       if (replacement_id != matched_id_as_string) {
         // The weird Uint64toString trick is because Windows does not like
         // to deal with %zu and a size_t in printf, nor does it support %llu.
diff --git a/components/feedback/anonymizer_tool_unittest.cc b/components/feedback/anonymizer_tool_unittest.cc
index 6464e02..190dfad 100644
--- a/components/feedback/anonymizer_tool_unittest.cc
+++ b/components/feedback/anonymizer_tool_unittest.cc
@@ -6,7 +6,6 @@
 
 #include <gtest/gtest.h>
 
-#include "base/stl_util.h"
 #include "base/strings/string_util.h"
 
 namespace feedback {
@@ -132,7 +131,7 @@
   EXPECT_EQ("[<IPv6: 2>]",
             AnonymizeCustomPatterns("[2001:db8:0:0:0:ff00:42:8329]"));
   EXPECT_EQ("[<IPv6: 3>]", AnonymizeCustomPatterns("[2001:db8::ff00:42:8329]"));
-  EXPECT_EQ("[<IPv6: 4>]", AnonymizeCustomPatterns("[::1]"));
+  EXPECT_EQ("[<IPv6: 4>]", AnonymizeCustomPatterns("[aa::bb]"));
   EXPECT_EQ("<IPv4: 1>", AnonymizeCustomPatterns("192.160.0.1"));
 
   EXPECT_EQ("<URL: 1>",
@@ -228,6 +227,10 @@
       "192.169.2.120\n"           // IP address.
       "169.254.0.1\n"             // Link local.
       "169.200.0.1\n"             // IP address.
+      "fe80::\n"                  // Link local.
+      "fe80::ffff\n"              // Link local.
+      "febf:ffff::ffff\n"         // Link local.
+      "fecc::1111\n"              // IP address.
       "224.0.0.24\n"              // Multicast.
       "240.0.0.0\n"               // IP address.
       "255.255.255.255\n"         // Broadcast.
@@ -243,7 +246,30 @@
       "11:11;11::11\n"            // IP address.
       "11::11\n"                  // IP address.
       "11:11:abcdef:0:0:0:0:0\n"  // No PII.
-      "aa:aa:aa:aa:aa:aa";        // MAC address (BSSID).
+      "::\n"                      // Unspecified.
+      "::1\n"                     // Local host.
+      "Instance::Set\n"           // Ignore match, no PII.
+      "Instant::ff\n"             // Ignore match, no PII.
+      "net::ERR_CONN_TIMEOUT\n"   // Ignore match, no PII.
+      "ff01::1\n"                 // All nodes address (interface local).
+      "ff01::2\n"                 // All routers (interface local).
+      "ff01::3\n"                 // Multicast (interface local).
+      "ff02::1\n"                 // All nodes address (link local).
+      "ff02::2\n"                 // All routers (link local).
+      "ff02::3\n"                 // Multicast (link local).
+      "ff02::fb\n"                // mDNSv6 (link local).
+      "ff08::fb\n"                // mDNSv6.
+      "ff0f::101\n"               // All NTP servers.
+      "::ffff:cb0c:10ea\n"        // IPv4-mapped IPv6 (IP address).
+      "::ffff:a0a:a0a\n"          // IPv4-mapped IPv6 (private class A).
+      "::ffff:a0a:a0a\n"          // Intentional duplicate.
+      "::ffff:ac1e:1e1e\n"        // IPv4-mapped IPv6 (private class B).
+      "::ffff:c0a8:640a\n"        // IPv4-mapped IPv6 (private class C).
+      "::ffff:6473:5c01\n"        // IPv4-mapped IPv6 (Chrome).
+      "64:ff9b::a0a:a0a\n"       // NAT64-translated IPv6 (private class A).
+      "64:ff9b::6473:5c01\n"     // NAT64-translated IPv6 (Chrome).
+      "::0101:ffff:c0a8:640a\n"  // IP address.
+      "aa:aa:aa:aa:aa:aa";       // MAC address (BSSID).
   std::string result =
       "aaaaaaaa [SSID=1]aaaaa\n"
       "aaaaaaaa<URL: 1>\n"
@@ -268,6 +294,10 @@
       "<IPv4: 16>\n"
       "<169.254.0.0/16: 17>\n"
       "<IPv4: 18>\n"
+      "<fe80::/10: 1>\n"
+      "<fe80::/10: 2>\n"
+      "<fe80::/10: 3>\n"
+      "<IPv6: 4>\n"
       "<224.0.0.0/4: 19>\n"
       "<IPv4: 20>\n"
       "255.255.255.255\n"
@@ -280,9 +310,32 @@
       "255.255.259.255\n"
       "255.300.255.255\n"
       "aaaa<IPv4: 28>aaa\n"
-      "11:11;<IPv6: 1>\n"
-      "<IPv6: 1>\n"
+      "11:11;<IPv6: 5>\n"
+      "<IPv6: 5>\n"
       "11:11:abcdef:0:0:0:0:0\n"
+      "::\n"
+      "::1\n"
+      "Instance::Set\n"
+      "Instant::ff\n"
+      "net::ERR_CONN_TIMEOUT\n"
+      "ff01::1\n"
+      "ff01::2\n"
+      "<ff01::/16: 13>\n"
+      "ff02::1\n"
+      "ff02::2\n"
+      "<ff02::/16: 16>\n"
+      "<ff02::/16: 17>\n"
+      "<IPv6: 18>\n"
+      "<IPv6: 19>\n"
+      "<IPv6: 20>\n"
+      "<M 10.0.0.0/8: 21>\n"
+      "<M 10.0.0.0/8: 21>\n"
+      "<M 172.16.0.0/12: 22>\n"
+      "<M 192.168.0.0/16: 23>\n"
+      "<M 100.115.92.1: 24>\n"
+      "<T 10.0.0.0/8: 25>\n"
+      "<T 100.115.92.1: 26>\n"
+      "<IPv6: 27>\n"
       "aa:aa:aa:00:00:01";
   EXPECT_EQ(result, anonymizer_.Anonymize(data));
 }