| # Copyright 2016 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """ |
| Implements a task builder for benchmarking effects of NoState Prefetch. |
Notable steps of the task pipeline:
| * Save a WPR archive |
| * Process the WPR archive to make all resources cacheable |
* Process the cache archive to patch response headers back to their original
| values. |
| * Find out which resources are discoverable by NoState Prefetch |
| (HTMLPreloadScanner) |
| * Load pages with empty/full/prefetched cache |
* Extract the most important metrics to a CSV
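
Illustrative usage (a sketch: `common_builder` is assumed to be provided by
the surrounding sandwich task framework, and the transformer list name below
is hypothetical):

  builder = PrefetchBenchmarkBuilder(common_builder)
  builder.PopulateLoadBenchmark(Discoverer.HTMLPreloadScanner,
                                'no-network-emulation',
                                transformer_list=[])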
| """ |
| |
| import csv |
import json
import logging
| import os |
| import re |
| import shutil |
| from urlparse import urlparse |
| |
| import chrome_cache |
| import common_util |
| import loading_trace |
| from prefetch_view import PrefetchSimulationView |
| from request_dependencies_lens import RequestDependencyLens |
| import sandwich_metrics |
| import sandwich_runner |
| import task_manager |
| import wpr_backend |
| |
| |
| class Discoverer(object): |
| # Do not prefetch anything. |
| EmptyCache = 'empty-cache' |
| |
| # Prefetches everything to load fully from cache (impossible in practice). |
| FullCache = 'full-cache' |
| |
  # Prefetches only the main document, after following its redirection chain.
| MainDocument = 'main-document' |
| |
  # All resources that are fetched from the main document, and their
  # redirections.
| Parser = 'parser' |
| |
  # Simulation of the HTMLPreloadScanner on the main document and its
  # redirections, and a subset of it:
  #   Store: keeps only resources that don't have Cache-Control: no-store.
| HTMLPreloadScanner = 'html-scanner' |
| HTMLPreloadScannerStore = 'html-scanner-store' |
| |
| |
# Set of all available sub-resource discoverers.
| SUBRESOURCE_DISCOVERERS = set([ |
| Discoverer.EmptyCache, |
| Discoverer.FullCache, |
| Discoverer.MainDocument, |
| Discoverer.Parser, |
| Discoverer.HTMLPreloadScanner, |
| Discoverer.HTMLPreloadScannerStore, |
| ]) |
| |
| |
| _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
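# Illustrative (assumed) example of a cache entry key matched by
# _UPLOAD_DATA_STREAM_REQUESTS_REGEX: '1/https://example.com/submit', where
# the leading integer is the upload data stream's session-unique identifier
# and the 'url' group captures the remaining URL.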
| |
| |
| def _PatchWpr(wpr_archive): |
| """Patches a WPR archive to get all resources into the HTTP cache and avoid |
| invalidation and revalidations. |
| |
| Args: |
| wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. |
| """ |
  # Sets the resources' cache max-age to 10 years.
| MAX_AGE = 10 * 365 * 24 * 60 * 60 |
| CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
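  # For reference, CACHE_CONTROL evaluates to 'public, max-age=315360000'
  # (315360000 seconds = 10 years).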
| |
| logging.info('number of entries: %d', len(wpr_archive.ListUrlEntries())) |
| patched_entry_count = 0 |
| for url_entry in wpr_archive.ListUrlEntries(): |
| response_headers = url_entry.GetResponseHeadersDict() |
    if ('cache-control' in response_headers and
        response_headers['cache-control'] == CACHE_CONTROL):
| continue |
    # Override the cache-control header to set the resources' max-age to
    # MAX_AGE.
    #
    # Important note: Some resources holding sensitive information might have
    # cache-control set to no-store, which allows the resource to be cached in
    # memory but not in the file system. NoState-Prefetch takes care of this
    # case, but here, to simulate NoState-Prefetch, we have no choice but to
    # save absolutely all cached resources on disk so that they survive
    # killing Chrome for cache saving, modification and pushing.
| url_entry.SetResponseHeader('cache-control', CACHE_CONTROL) |
| |
| # TODO(gabadie): May need to extend Vary blacklist (referer?) |
| # |
    # All of these Vary and Pragma possibilities need to be removed from the
    # response headers in order for Chrome to store a resource in the HTTP
    # cache and not invalidate it.
| url_entry.RemoveResponseHeaderDirectives('vary', {'*', 'cookie'}) |
| url_entry.RemoveResponseHeaderDirectives('pragma', {'no-cache'}) |
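    # For illustration (assuming the directive-removal semantics of
    # wpr_backend): a header such as 'Vary: Accept-Encoding, Cookie' would be
    # reduced to 'Vary: Accept-Encoding'.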
| patched_entry_count += 1 |
| logging.info('number of entries patched: %d', patched_entry_count) |
| |
| |
| def _FilterOutDataAndIncompleteRequests(requests): |
| for request in filter(lambda r: not r.IsDataRequest(), requests): |
    # The protocol is only known once the response has been received. But the
    # trace recording might have been stopped while some JavaScript-originated
    # requests had not received any response yet.
| if request.protocol is None: |
| assert not request.HasReceivedResponse() |
| continue |
| if request.protocol not in {'http/0.9', 'http/1.0', 'http/1.1'}: |
| raise RuntimeError('Unknown request protocol {}'.format(request.protocol)) |
| yield request |
| |
| |
| def _PatchCacheArchive(cache_archive_path, loading_trace_path, |
| cache_archive_dest_path): |
| """Patch the cache archive. |
| |
| Note: This method update the raw response headers of cache entries' to store |
| the ones such as Set-Cookie that were pruned by the |
| net::HttpCacheTransaction, and remove the stream index 2 holding resource's |
| compile meta data. |
| |
| Args: |
| cache_archive_path: Input archive's path to patch. |
| loading_trace_path: Path of the loading trace that have recorded the cache |
| archive <cache_archive_path>. |
| cache_archive_dest_path: Archive destination's path. |
| """ |
| trace = loading_trace.LoadingTrace.FromJsonFile(loading_trace_path) |
| with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path: |
| cache_path = os.path.join(tmp_path, 'cache') |
| chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path) |
| cache_backend = chrome_cache.CacheBackend(cache_path, 'simple') |
| cache_entries = set(cache_backend.ListKeys()) |
    logging.info('Original cache size: %d bytes', cache_backend.GetSize())
| for request in _FilterOutDataAndIncompleteRequests( |
| trace.request_track.GetEvents()): |
      # For requests that have an upload data stream, such as POST requests,
      # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
      # the upload data stream's session-unique identifier.
      #
      # It is fine not to patch these requests since, when reopening Chrome,
      # the entry cannot be reused because the upload data stream's identifier
      # will be different.
      #
      # The fact that these entries are kept in the cache after closing Chrome
      # properly (by closing the Chrome tab, as ChromeControler.SetSlowDeath()
      # does) is a known Chrome bug (crbug.com/610725).
| if request.url not in cache_entries: |
| continue |
      # Chrome prunes Set-Cookie from response headers before storing them in
      # the disk cache. It also adds an implicit "Vary: cookie" header to all
      # redirect response headers. Sandwich manages the cache, but between
      # recording the cache and benchmarking, the cookie jar is invalidated,
      # which leads to the invalidation of all cacheable redirects.
| raw_headers = request.GetRawResponseHeaders() |
| cache_backend.UpdateRawResponseHeaders(request.url, raw_headers) |
| # NoState-Prefetch would only fetch the resources, but not parse them. |
| cache_backend.DeleteStreamForKey(request.url, 2) |
| chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path) |
    logging.info('Patched cache size: %d bytes', cache_backend.GetSize())
| |
| |
| def _DiscoverRequests(dependencies_lens, subresource_discoverer): |
| trace = dependencies_lens.loading_trace |
| first_resource_request = trace.request_track.GetFirstResourceRequest() |
| |
| if subresource_discoverer == Discoverer.EmptyCache: |
| requests = [] |
| elif subresource_discoverer == Discoverer.FullCache: |
| requests = dependencies_lens.loading_trace.request_track.GetEvents() |
| elif subresource_discoverer == Discoverer.MainDocument: |
| requests = [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] |
| elif subresource_discoverer == Discoverer.Parser: |
| requests = PrefetchSimulationView.ParserDiscoverableRequests( |
| first_resource_request, dependencies_lens) |
| elif subresource_discoverer == Discoverer.HTMLPreloadScanner: |
| requests = PrefetchSimulationView.PreloadedRequests( |
| first_resource_request, dependencies_lens, trace) |
  else:
    assert False, 'unknown sub-resource discoverer: {}'.format(
        subresource_discoverer)
| logging.info('number of requests discovered by %s: %d', |
| subresource_discoverer, len(requests)) |
| return requests |
| |
| |
| def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): |
| with open(original_headers_path) as file_input: |
| original_headers = json.load(file_input) |
| pruned_requests = set() |
| for request in requests: |
| request_original_headers = original_headers[request.url] |
| if ('cache-control' in request_original_headers and |
| 'no-store' in request_original_headers['cache-control'].lower()): |
| pruned_requests.add(request) |
| return [r for r in requests if r not in pruned_requests] |
| |
| |
| def _ExtractDiscoverableUrls( |
| original_headers_path, loading_trace_path, subresource_discoverer): |
| """Extracts discoverable resource urls from a loading trace according to a |
| sub-resource discoverer. |
| |
| Args: |
| original_headers_path: Path of JSON containing the original headers. |
| loading_trace_path: Path of the loading trace recorded at original cache |
| creation. |
    subresource_discoverer: The sub-resource discoverer that should white-list
| the resources to keep in cache for the NoState-Prefetch benchmarks. |
| |
| Returns: |
| A set of urls. |
| """ |
| assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \ |
| 'unknown prefetch simulation {}'.format(subresource_discoverer) |
| logging.info('loading %s', loading_trace_path) |
| trace = loading_trace.LoadingTrace.FromJsonFile(loading_trace_path) |
| dependencies_lens = RequestDependencyLens(trace) |
| |
| # Build the list of discovered requests according to the desired simulation. |
| discovered_requests = [] |
| if subresource_discoverer == Discoverer.HTMLPreloadScannerStore: |
| requests = _DiscoverRequests( |
| dependencies_lens, Discoverer.HTMLPreloadScanner) |
| discovered_requests = _PruneOutOriginalNoStoreRequests( |
| original_headers_path, requests) |
| else: |
| discovered_requests = _DiscoverRequests( |
| dependencies_lens, subresource_discoverer) |
| |
| whitelisted_urls = set() |
| for request in _FilterOutDataAndIncompleteRequests(discovered_requests): |
| logging.debug('white-listing %s', request.url) |
| whitelisted_urls.add(request.url) |
| logging.info('number of white-listed resources: %d', len(whitelisted_urls)) |
| return whitelisted_urls |
| |
| |
| def _PrintUrlSetComparison(ref_url_set, url_set, url_set_name): |
| """Compare URL sets and log the diffs. |
| |
| Args: |
| ref_url_set: Set of reference urls. |
| url_set: Set of urls to compare to the reference. |
| url_set_name: The set name for logging purposes. |
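
  Example (illustrative):
    _PrintUrlSetComparison({'http://a/', 'http://b/'}, {'http://a/'},
                           'sub-resources')
    # Logs an error that 'sub-resources' are not matching (expected 2, had 1)
    # and lists 'http://b/' as missing.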
| """ |
| assert type(ref_url_set) == set |
| assert type(url_set) == set |
| if ref_url_set == url_set: |
    logging.info('  %d %s are matching.', len(ref_url_set), url_set_name)
| return |
| missing_urls = ref_url_set.difference(url_set) |
| unexpected_urls = url_set.difference(ref_url_set) |
  logging.error('  %s are not matching (expected %d, had %d)',
                url_set_name, len(ref_url_set), len(url_set))
  logging.error('  List of %d missing resources:', len(missing_urls))
  for url in sorted(missing_urls):
    logging.error('- ' + url)
  logging.error('  List of %d unexpected resources:', len(unexpected_urls))
  for url in sorted(unexpected_urls):
    logging.error('+ ' + url)
| |
| |
class _RequestOutcome(object):
| All, ServedFromCache, NotServedFromCache, Post = range(4) |
| |
| |
| def _ListUrlRequests(trace, request_kind): |
| """Lists requested URLs from a trace. |
| |
| Args: |
| trace: (loading_trace.LoadingTrace) loading trace. |
| request_kind: _RequestOutcome indicating the subset of requests to output. |
| |
| Returns: |
| set([str]) |
| """ |
| urls = set() |
| for request_event in _FilterOutDataAndIncompleteRequests( |
| trace.request_track.GetEvents()): |
| if (request_kind == _RequestOutcome.ServedFromCache and |
| request_event.from_disk_cache): |
| urls.add(request_event.url) |
| elif (request_kind == _RequestOutcome.Post and |
| request_event.method.upper().strip() == 'POST'): |
| urls.add(request_event.url) |
| elif (request_kind == _RequestOutcome.NotServedFromCache and |
| not request_event.from_disk_cache): |
| urls.add(request_event.url) |
| elif request_kind == _RequestOutcome.All: |
| urls.add(request_event.url) |
| return urls |
| |
| |
| class _RunOutputVerifier(object): |
| """Object to verify benchmark run from traces and WPR log stored in the |
| runner output directory. |
| """ |
| |
| def __init__(self, cache_validation_result, benchmark_setup): |
| """Constructor. |
| |
| Args: |
| cache_validation_result: JSON of the cache validation task. |
| benchmark_setup: JSON of the benchmark setup. |
| """ |
| self._cache_whitelist = set(benchmark_setup['cache_whitelist']) |
| self._original_requests = set( |
| cache_validation_result['effective_encoded_data_lengths'].keys()) |
| self._original_post_requests = set( |
| cache_validation_result['effective_post_requests']) |
| self._original_cached_requests = self._original_requests.intersection( |
| self._cache_whitelist) |
| self._original_uncached_requests = self._original_requests.difference( |
| self._cache_whitelist) |
| self._all_sent_url_requests = set() |
| |
| def VerifyTrace(self, trace): |
| """Verifies a trace with the cache validation result and the benchmark |
| setup. |
| """ |
| effective_requests = _ListUrlRequests(trace, _RequestOutcome.All) |
| effective_post_requests = _ListUrlRequests(trace, _RequestOutcome.Post) |
    effective_cached_requests = _ListUrlRequests(
        trace, _RequestOutcome.ServedFromCache)
    effective_uncached_requests = _ListUrlRequests(
        trace, _RequestOutcome.NotServedFromCache)
| |
| missing_requests = self._original_requests.difference(effective_requests) |
| unexpected_requests = effective_requests.difference(self._original_requests) |
    expected_cached_requests = self._original_cached_requests.difference(
        missing_requests)
| expected_uncached_requests = self._original_uncached_requests.union( |
| unexpected_requests).difference(missing_requests) |
| |
| # POST requests are known to be unable to use the cache. |
| expected_cached_requests.difference_update(effective_post_requests) |
| expected_uncached_requests.update(effective_post_requests) |
| |
| _PrintUrlSetComparison(self._original_requests, effective_requests, |
| 'All resources') |
| _PrintUrlSetComparison(set(), effective_post_requests, 'POST resources') |
| _PrintUrlSetComparison(expected_cached_requests, effective_cached_requests, |
| 'Cached resources') |
| _PrintUrlSetComparison(expected_uncached_requests, |
| effective_uncached_requests, 'Non cached resources') |
| |
| self._all_sent_url_requests.update(effective_uncached_requests) |
| |
| def VerifyWprLog(self, wpr_log_path): |
| """Verifies WPR log with previously verified traces.""" |
| all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
| all_wpr_urls = set() |
| unserved_wpr_urls = set() |
| wpr_command_colliding_urls = set() |
| |
| for request in all_wpr_requests: |
| if request.is_wpr_host: |
| continue |
| if urlparse(request.url).path.startswith('/web-page-replay'): |
| wpr_command_colliding_urls.add(request.url) |
| elif request.is_served is False: |
| unserved_wpr_urls.add(request.url) |
| all_wpr_urls.add(request.url) |
| |
| _PrintUrlSetComparison(set(), unserved_wpr_urls, |
| 'Distinct unserved resources from WPR') |
| _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
                           'Distinct resources colliding with WPR commands')
| _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
| 'Distinct resource requests to WPR') |
| |
| |
| def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
| """Validates a cache archive content. |
| |
| Args: |
| cache_build_trace_path: Path of the generated trace at the cache build time. |
| cache_archive_path: Cache archive's path to validate. |
| |
| Returns: |
| { |
| 'effective_encoded_data_lengths': |
| {URL of all requests: encoded_data_length}, |
| 'effective_post_requests': [URLs of POST requests], |
| 'expected_cached_resources': [URLs of resources expected to be cached], |
    'successfully_cached_resources': [URLs of cached sub-resources]
| } |
| """ |
| # TODO(gabadie): What's the best way of propagating errors happening in here? |
  logging.info('Listing cached urls from %s', cache_archive_path)
| with common_util.TemporaryDirectory() as cache_directory: |
| chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory) |
| cache_keys = set( |
| chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys()) |
| trace = loading_trace.LoadingTrace.FromJsonFile(cache_build_trace_path) |
| effective_requests = _ListUrlRequests(trace, _RequestOutcome.All) |
| effective_post_requests = _ListUrlRequests(trace, _RequestOutcome.Post) |
| effective_encoded_data_lengths = {} |
| for request in _FilterOutDataAndIncompleteRequests( |
| trace.request_track.GetEvents()): |
| if request.from_disk_cache or request.served_from_cache: |
      # At cache archive creation time, a request might be loaded several
      # times; skip the ones served from cache, since their
      # request.encoded_data_length would be 0.
| continue |
| if request.url in effective_encoded_data_lengths: |
| effective_encoded_data_lengths[request.url] = max( |
| effective_encoded_data_lengths[request.url], |
| request.GetEncodedDataLength()) |
| else: |
| effective_encoded_data_lengths[request.url] = ( |
| request.GetEncodedDataLength()) |
| |
| upload_data_stream_cache_entry_keys = set() |
| upload_data_stream_requests = set() |
| for cache_entry_key in cache_keys: |
| match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key) |
| if not match: |
| continue |
| upload_data_stream_cache_entry_keys.add(cache_entry_key) |
| upload_data_stream_requests.add(match.group('url')) |
| |
| expected_cached_requests = effective_requests.difference( |
| effective_post_requests) |
| effective_cache_keys = cache_keys.difference( |
| upload_data_stream_cache_entry_keys) |
| |
| _PrintUrlSetComparison(effective_post_requests, upload_data_stream_requests, |
| 'POST resources') |
| _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys, |
| 'Cached resources') |
| |
| return { |
| 'effective_encoded_data_lengths': effective_encoded_data_lengths, |
| 'effective_post_requests': [url for url in effective_post_requests], |
| 'expected_cached_resources': [url for url in expected_cached_requests], |
| 'successfully_cached_resources': [url for url in effective_cache_keys] |
| } |
| |
| |
| def _ProcessRunOutputDir( |
| cache_validation_result, benchmark_setup, runner_output_dir): |
| """Process benchmark's run output directory. |
| |
| Args: |
| cache_validation_result: Same as for _RunOutputVerifier |
| benchmark_setup: Same as for _RunOutputVerifier |
| runner_output_dir: Same as for SandwichRunner.output_dir |
| |
| Returns: |
    List of dictionaries.
| """ |
| run_metrics_list = [] |
| run_output_verifier = _RunOutputVerifier( |
| cache_validation_result, benchmark_setup) |
| cached_encoded_data_lengths = ( |
| cache_validation_result['effective_encoded_data_lengths']) |
| for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns( |
| runner_output_dir): |
| trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME) |
| |
| logging.info('loading trace: %s', trace_path) |
| trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
| |
| logging.info('verifying trace: %s', trace_path) |
| run_output_verifier.VerifyTrace(trace) |
| |
| logging.info('extracting metrics from trace: %s', trace_path) |
| served_from_network_bytes = 0 |
| served_from_cache_bytes = 0 |
| urls_hitting_network = set() |
| for request in _FilterOutDataAndIncompleteRequests( |
| trace.request_track.GetEvents()): |
      # Ignore requests served from Blink's cache.
| if request.served_from_cache: |
| continue |
| urls_hitting_network.add(request.url) |
| if request.from_disk_cache: |
| served_from_cache_bytes += cached_encoded_data_lengths[request.url] |
| else: |
| served_from_network_bytes += request.GetEncodedDataLength() |
| |
    # Make sure that requests served from Blink's cache have at least one
    # corresponding request that was not served from Blink's cache.
| for request in _FilterOutDataAndIncompleteRequests( |
| trace.request_track.GetEvents()): |
| assert (request.url in urls_hitting_network or |
| not request.served_from_cache) |
| |
| run_metrics = { |
| 'url': trace.url, |
| 'repeat_id': repeat_id, |
| 'subresource_discoverer': benchmark_setup['subresource_discoverer'], |
| 'cache_recording.subresource_count': |
| len(cache_validation_result['effective_encoded_data_lengths']), |
| 'cache_recording.cached_subresource_count_theoretic': |
| len(cache_validation_result['successfully_cached_resources']), |
| 'cache_recording.cached_subresource_count': |
| len(cache_validation_result['expected_cached_resources']), |
| 'benchmark.subresource_count': len(_ListUrlRequests( |
| trace, _RequestOutcome.All)), |
| 'benchmark.served_from_cache_count_theoretic': |
| len(benchmark_setup['cache_whitelist']), |
| 'benchmark.served_from_cache_count': len(_ListUrlRequests( |
| trace, _RequestOutcome.ServedFromCache)), |
| 'benchmark.served_from_network_bytes': served_from_network_bytes, |
| 'benchmark.served_from_cache_bytes': served_from_cache_bytes |
| } |
| run_metrics.update( |
| sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory( |
| repeat_dir, trace)) |
| run_metrics_list.append(run_metrics) |
| run_metrics_list.sort(key=lambda e: e['repeat_id']) |
| |
| wpr_log_path = os.path.join( |
| runner_output_dir, sandwich_runner.WPR_LOG_FILENAME) |
| logging.info('verifying wpr log: %s', wpr_log_path) |
| run_output_verifier.VerifyWprLog(wpr_log_path) |
| return run_metrics_list |
| |
| |
| class PrefetchBenchmarkBuilder(task_manager.Builder): |
| """A builder for a graph of tasks for NoState-Prefetch emulated benchmarks.""" |
| |
| def __init__(self, common_builder): |
| task_manager.Builder.__init__(self, |
| common_builder.output_directory, |
| common_builder.output_subdirectory) |
| self._common_builder = common_builder |
| |
| self._original_headers_path = None |
| self._wpr_archive_path = None |
| self._cache_path = None |
| self._trace_from_grabbing_reference_cache = None |
| self._cache_validation_task = None |
| self._PopulateCommonPipelines() |
| |
| def _PopulateCommonPipelines(self): |
| """Creates necessary tasks to produce initial cache archive. |
| |
| Also creates a task for producing a json file with a mapping of URLs to |
| subresources (urls-resources.json). |
| |
| Here is the full dependency tree for the returned task: |
| common/patched-cache-validation.json |
| depends on: common/patched-cache.zip |
| depends on: common/original-cache.zip |
| depends on: common/webpages-patched.wpr |
| depends on: common/webpages.wpr |
| """ |
| self._original_headers_path = self.RebaseOutputPath( |
| 'common/response-headers.json') |
| |
| @self.RegisterTask('common/webpages-patched.wpr', |
| dependencies=[self._common_builder.original_wpr_task]) |
| def BuildPatchedWpr(): |
| common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) |
| shutil.copyfile( |
| self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) |
| wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) |
| |
      # Save the original response headers.
      original_response_headers = {
          e.url: e.GetResponseHeadersDict()
          for e in wpr_archive.ListUrlEntries()}
| with open(self._original_headers_path, 'w') as file_output: |
| json.dump(original_response_headers, file_output) |
| |
| # Patch WPR. |
| _PatchWpr(wpr_archive) |
| wpr_archive.Persist() |
| |
| @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) |
| def BuildOriginalCache(): |
| runner = self._common_builder.CreateSandwichRunner() |
| runner.wpr_archive_path = BuildPatchedWpr.path |
| runner.cache_archive_path = BuildOriginalCache.path |
| runner.cache_operation = sandwich_runner.CacheOperation.SAVE |
| runner.output_dir = BuildOriginalCache.run_path |
| runner.Run() |
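    # Derive the run output directory from the archive path by stripping the
    # '.zip' suffix and appending '-run'.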
| BuildOriginalCache.run_path = BuildOriginalCache.path[:-4] + '-run' |
| original_cache_trace_path = os.path.join( |
| BuildOriginalCache.run_path, '0', sandwich_runner.TRACE_FILENAME) |
| |
| @self.RegisterTask('common/patched-cache.zip', [BuildOriginalCache]) |
| def BuildPatchedCache(): |
| _PatchCacheArchive(BuildOriginalCache.path, |
| original_cache_trace_path, BuildPatchedCache.path) |
| |
| @self.RegisterTask('common/patched-cache-validation.json', |
| [BuildPatchedCache]) |
| def ValidatePatchedCache(): |
| cache_validation_result = _ValidateCacheArchiveContent( |
| original_cache_trace_path, BuildPatchedCache.path) |
| with open(ValidatePatchedCache.path, 'w') as output: |
| json.dump(cache_validation_result, output) |
| |
| self._wpr_archive_path = BuildPatchedWpr.path |
| self._trace_from_grabbing_reference_cache = original_cache_trace_path |
| self._cache_path = BuildPatchedCache.path |
| self._cache_validation_task = ValidatePatchedCache |
| |
| self._common_builder.default_final_tasks.append(ValidatePatchedCache) |
| |
| def PopulateLoadBenchmark(self, subresource_discoverer, |
| transformer_list_name, transformer_list): |
| """Populate benchmarking tasks from its setup tasks. |
| |
| Args: |
| subresource_discoverer: Name of a subresources discoverer. |
      transformer_list_name: A string describing the transformers; it will be
        used in task names (prefer names without spaces or special characters).
      transformer_list: An ordered list of functions that take an instance of
        SandwichRunner as a parameter; they are applied immediately before
        SandwichRunner.Run(), in the given order.
| |
    Here is the full dependency tree added for the returned task:
| <transformer_list_name>/<subresource_discoverer>-metrics.csv |
| depends on: <transformer_list_name>/<subresource_discoverer>-run/ |
| depends on: common/<subresource_discoverer>-cache.zip |
| depends on: common/<subresource_discoverer>-setup.json |
| depends on: common/patched-cache-validation.json |
| """ |
| additional_column_names = [ |
| 'url', |
| 'repeat_id', |
| 'subresource_discoverer', |
| 'cache_recording.subresource_count', |
| 'cache_recording.cached_subresource_count_theoretic', |
| 'cache_recording.cached_subresource_count', |
| 'benchmark.subresource_count', |
| 'benchmark.served_from_cache_count_theoretic', |
| 'benchmark.served_from_cache_count', |
| 'benchmark.served_from_network_bytes', |
| 'benchmark.served_from_cache_bytes'] |
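    # The final CSV header is this list followed by
    # sandwich_metrics.COMMON_CSV_COLUMN_NAMES (see ProcessRunOutputDir below).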
| |
| assert subresource_discoverer in SUBRESOURCE_DISCOVERERS |
| assert 'common' not in SUBRESOURCE_DISCOVERERS |
| shared_task_prefix = os.path.join('common', subresource_discoverer) |
| task_prefix = os.path.join(transformer_list_name, subresource_discoverer) |
| |
| @self.RegisterTask(shared_task_prefix + '-setup.json', merge=True, |
| dependencies=[self._cache_validation_task]) |
| def SetupBenchmark(): |
| whitelisted_urls = _ExtractDiscoverableUrls( |
| original_headers_path=self._original_headers_path, |
| loading_trace_path=self._trace_from_grabbing_reference_cache, |
| subresource_discoverer=subresource_discoverer) |
| |
| common_util.EnsureParentDirectoryExists(SetupBenchmark.path) |
| with open(SetupBenchmark.path, 'w') as output: |
| json.dump({ |
| 'cache_whitelist': [url for url in whitelisted_urls], |
| 'subresource_discoverer': subresource_discoverer, |
| }, output) |
| |
| @self.RegisterTask(shared_task_prefix + '-cache.zip', merge=True, |
| dependencies=[SetupBenchmark]) |
| def BuildBenchmarkCacheArchive(): |
      with open(SetupBenchmark.path) as setup_file:
        benchmark_setup = json.load(setup_file)
| chrome_cache.ApplyUrlWhitelistToCacheArchive( |
| cache_archive_path=self._cache_path, |
| whitelisted_urls=benchmark_setup['cache_whitelist'], |
| output_cache_archive_path=BuildBenchmarkCacheArchive.path) |
| |
| @self.RegisterTask(task_prefix + '-run/', |
| dependencies=[BuildBenchmarkCacheArchive]) |
| def RunBenchmark(): |
| runner = self._common_builder.CreateSandwichRunner() |
| for transformer in transformer_list: |
| transformer(runner) |
| runner.wpr_archive_path = self._wpr_archive_path |
| runner.wpr_out_log_path = os.path.join( |
| RunBenchmark.path, sandwich_runner.WPR_LOG_FILENAME) |
| runner.cache_archive_path = BuildBenchmarkCacheArchive.path |
| runner.cache_operation = sandwich_runner.CacheOperation.PUSH |
| runner.output_dir = RunBenchmark.path |
| runner.Run() |
| |
| @self.RegisterTask(task_prefix + '-metrics.csv', |
| dependencies=[RunBenchmark]) |
| def ProcessRunOutputDir(): |
      with open(SetupBenchmark.path) as setup_file:
        benchmark_setup = json.load(setup_file)
      with open(self._cache_validation_task.path) as validation_file:
        cache_validation_result = json.load(validation_file)
| |
| run_metrics_list = _ProcessRunOutputDir( |
| cache_validation_result, benchmark_setup, RunBenchmark.path) |
| with open(ProcessRunOutputDir.path, 'w') as csv_file: |
| writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
| sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
| writer.writeheader() |
| for trace_metrics in run_metrics_list: |
| writer.writerow(trace_metrics) |
| |
| self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |