Track DNS Server Success and Failure time and use it in next server selection.
BUG=110197
Review URL: https://chromiumcodereview.appspot.com/16357018
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@204915 0039d316-1c4b-4281-b951-d872f2087c98
diff --git a/net/dns/dns_session.cc b/net/dns/dns_session.cc
index 6d8d0db..2e6d0c9 100644
--- a/net/dns/dns_session.cc
+++ b/net/dns/dns_session.cc
@@ -6,6 +6,7 @@
#include "base/basictypes.h"
#include "base/bind.h"
+#include "base/lazy_instance.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sample_vector.h"
#include "base/rand_util.h"
@@ -32,6 +33,40 @@
const unsigned kRTOPercentile = 99;
} // namespace
+// Runtime statistics of DNS server.
+struct DnsSession::ServerStats {
+ ServerStats(base::TimeDelta rtt_estimate_param, RttBuckets* buckets)
+ : last_failure_count(0), rtt_estimate(rtt_estimate_param) {
+ rtt_histogram.reset(new base::SampleVector(buckets));
+ }
+
+ // Count of consecutive failures after last success.
+ int last_failure_count;
+
+ // Last time when server returned failure or timeout.
+ base::Time last_failure;
+ // Last time when server returned success.
+ base::Time last_success;
+
+ // Estimated RTT using moving average.
+ base::TimeDelta rtt_estimate;
+ // Estimated error in the above.
+ base::TimeDelta rtt_deviation;
+
+ // A histogram of observed RTT .
+ scoped_ptr<base::SampleVector> rtt_histogram;
+
+ DISALLOW_COPY_AND_ASSIGN(ServerStats);
+};
+
+// static
+base::LazyInstance<DnsSession::RttBuckets>::Leaky DnsSession::rtt_buckets_ =
+ LAZY_INSTANCE_INITIALIZER;
+
+DnsSession::RttBuckets::RttBuckets() : base::BucketRanges(kRTTBucketCount + 1) {
+ base::Histogram::InitializeBucketRanges(1, 5000, kRTTBucketCount, this);
+}
+
DnsSession::SocketLease::SocketLease(scoped_refptr<DnsSession> session,
unsigned server_index,
scoped_ptr<DatagramClientSocket> socket)
@@ -49,33 +84,79 @@
socket_pool_(socket_pool.Pass()),
rand_callback_(base::Bind(rand_int_callback, 0, kuint16max)),
net_log_(net_log),
- server_index_(0),
- rtt_estimates_(config_.nameservers.size(), config_.timeout),
- rtt_deviations_(config_.nameservers.size()),
- rtt_buckets_(new base::BucketRanges(kRTTBucketCount + 1)) {
+ server_index_(0) {
socket_pool_->Initialize(&config_.nameservers, net_log);
-
- // TODO(mef): This could be done once per process lifetime.
- base::Histogram::InitializeBucketRanges(1, 5000, kRTTBucketCount,
- rtt_buckets_.get());
+ UMA_HISTOGRAM_CUSTOM_COUNTS(
+ "AsyncDNS.ServerCount", config_.nameservers.size(), 0, 10, 10);
for (size_t i = 0; i < config_.nameservers.size(); ++i) {
- rtt_histograms_.push_back(new base::SampleVector(rtt_buckets_.get()));
+ server_stats_.push_back(new ServerStats(config_.timeout,
+ rtt_buckets_.Pointer()));
}
}
-DnsSession::~DnsSession() {}
+DnsSession::~DnsSession() {
+ RecordServerStats();
+}
int DnsSession::NextQueryId() const { return rand_callback_.Run(); }
-int DnsSession::NextFirstServerIndex() {
- int index = server_index_;
+unsigned DnsSession::NextFirstServerIndex() {
+ unsigned index = NextGoodServerIndex(server_index_);
if (config_.rotate)
server_index_ = (server_index_ + 1) % config_.nameservers.size();
return index;
}
+unsigned DnsSession::NextGoodServerIndex(unsigned server_index) {
+ unsigned index = server_index;
+ base::Time oldest_server_failure(base::Time::Now());
+ unsigned oldest_server_failure_index = 0;
+
+ UMA_HISTOGRAM_BOOLEAN("AsyncDNS.ServerIsGood",
+ server_stats_[server_index]->last_failure.is_null());
+
+ do {
+ base::Time cur_server_failure = server_stats_[index]->last_failure;
+ // If number of failures on this server doesn't exceed number of allowed
+ // attempts, return its index.
+ if (server_stats_[server_index]->last_failure_count < config_.attempts) {
+ return index;
+ }
+ // Track oldest failed server.
+ if (cur_server_failure < oldest_server_failure) {
+ oldest_server_failure = cur_server_failure;
+ oldest_server_failure_index = index;
+ }
+ index = (index + 1) % config_.nameservers.size();
+ } while (index != server_index);
+
+ // If we are here it means that there are no successful servers, so we have
+ // to use one that has failed oldest.
+ return oldest_server_failure_index;
+}
+
+void DnsSession::RecordServerFailure(unsigned server_index) {
+ UMA_HISTOGRAM_CUSTOM_COUNTS(
+ "AsyncDNS.ServerFailureIndex", server_index, 0, 10, 10);
+ ++(server_stats_[server_index]->last_failure_count);
+ server_stats_[server_index]->last_failure = base::Time::Now();
+}
+
+void DnsSession::RecordServerSuccess(unsigned server_index) {
+ if (server_stats_[server_index]->last_success.is_null()) {
+ UMA_HISTOGRAM_COUNTS_100("AsyncDNS.ServerFailuresAfterNetworkChange",
+ server_stats_[server_index]->last_failure_count);
+ } else {
+ UMA_HISTOGRAM_COUNTS_100("AsyncDNS.ServerFailuresBeforeSuccess",
+ server_stats_[server_index]->last_failure_count);
+ }
+ server_stats_[server_index]->last_failure_count = 0;
+ server_stats_[server_index]->last_failure = base::Time();
+ server_stats_[server_index]->last_success = base::Time::Now();
+}
+
void DnsSession::RecordRTT(unsigned server_index, base::TimeDelta rtt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
// For measurement, assume it is the first attempt (no backoff).
base::TimeDelta timeout_jacobson = NextTimeoutFromJacobson(server_index, 0);
@@ -90,8 +171,8 @@
// Jacobson/Karels algorithm for TCP.
// Using parameters: alpha = 1/8, delta = 1/4, beta = 4
- base::TimeDelta& estimate = rtt_estimates_[server_index];
- base::TimeDelta& deviation = rtt_deviations_[server_index];
+ base::TimeDelta& estimate = server_stats_[server_index]->rtt_estimate;
+ base::TimeDelta& deviation = server_stats_[server_index]->rtt_deviation;
base::TimeDelta current_error = rtt - estimate;
estimate += current_error / 8; // * alpha
base::TimeDelta abs_error = base::TimeDelta::FromInternalValue(
@@ -99,7 +180,8 @@
deviation += (abs_error - deviation) / 4; // * delta
// Histogram-based method.
- rtt_histograms_[server_index]->Accumulate(rtt.InMilliseconds(), 1);
+ server_stats_[server_index]->rtt_histogram
+ ->Accumulate(rtt.InMilliseconds(), 1);
}
void DnsSession::RecordLostPacket(unsigned server_index, int attempt) {
@@ -111,10 +193,28 @@
UMA_HISTOGRAM_TIMES("AsyncDNS.TimeoutSpentHistogram", timeout_histogram);
}
+void DnsSession::RecordServerStats() {
+ for (size_t index = 0; index < server_stats_.size(); ++index) {
+ if (server_stats_[index]->last_failure_count) {
+ if (server_stats_[index]->last_success.is_null()) {
+ UMA_HISTOGRAM_COUNTS("AsyncDNS.ServerFailuresWithoutSuccess",
+ server_stats_[index]->last_failure_count);
+ } else {
+ UMA_HISTOGRAM_COUNTS("AsyncDNS.ServerFailuresAfterSuccess",
+ server_stats_[index]->last_failure_count);
+ }
+ }
+ }
+}
+
+
base::TimeDelta DnsSession::NextTimeout(unsigned server_index, int attempt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
base::TimeDelta timeout = config_.timeout;
+ // If this server has not responded successfully, then don't wait too long.
+ if (server_stats_[server_index]->last_success.is_null())
+ return timeout;
// The timeout doubles every full round (each nameserver once).
unsigned num_backoffs = attempt / config_.nameservers.size();
@@ -155,10 +255,10 @@
base::TimeDelta DnsSession::NextTimeoutFromJacobson(unsigned server_index,
int attempt) {
- DCHECK_LT(server_index, rtt_estimates_.size());
+ DCHECK_LT(server_index, server_stats_.size());
- base::TimeDelta timeout =
- rtt_estimates_[server_index] + 4 * rtt_deviations_[server_index];
+ base::TimeDelta timeout = server_stats_[server_index]->rtt_estimate +
+ 4 * server_stats_[server_index]->rtt_deviation;
timeout = std::max(timeout, base::TimeDelta::FromMilliseconds(kMinTimeoutMs));
@@ -171,23 +271,25 @@
base::TimeDelta DnsSession::NextTimeoutFromHistogram(unsigned server_index,
int attempt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
COMPILE_ASSERT(std::numeric_limits<base::HistogramBase::Count>::is_signed,
histogram_base_count_assumed_to_be_signed);
// Use fixed percentile of observed samples.
- const base::SampleVector& samples = *rtt_histograms_[server_index];
+ const base::SampleVector& samples =
+ *server_stats_[server_index]->rtt_histogram;
+
base::HistogramBase::Count total = samples.TotalCount();
base::HistogramBase::Count remaining_count = kRTOPercentile * total / 100;
size_t index = 0;
- while (remaining_count > 0 && index < rtt_buckets_->size()) {
+ while (remaining_count > 0 && index < rtt_buckets_.Get().size()) {
remaining_count -= samples.GetCountAtIndex(index);
++index;
}
base::TimeDelta timeout =
- base::TimeDelta::FromMilliseconds(rtt_buckets_->range(index));
+ base::TimeDelta::FromMilliseconds(rtt_buckets_.Get().range(index));
timeout = std::max(timeout, base::TimeDelta::FromMilliseconds(kMinTimeoutMs));