summaryrefslogtreecommitdiffstats
path: root/debuginfod
diff options
context:
space:
mode:
Diffstat (limited to 'debuginfod')
-rw-r--r--debuginfod/ChangeLog247
-rw-r--r--debuginfod/Makefile.am2
-rw-r--r--debuginfod/debuginfod-client.c495
-rw-r--r--debuginfod/debuginfod-find.c91
-rw-r--r--debuginfod/debuginfod.cxx1828
-rw-r--r--debuginfod/debuginfod.h19
-rw-r--r--debuginfod/libdebuginfod.map7
7 files changed, 1980 insertions, 709 deletions
diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 8aa29443..bc3bce32 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,250 @@
+2020-03-29 Mark Wielaard <[email protected]>
+
+ * debuginfod-client.c (debuginfod_add_http_header): Check header
+ contains precisely one colon that isn't the first or last char.
+
+2020-03-29 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (struct debuginfod_client): Add a flag field
+ for progressfn printing.
+ (default_progressfn): Set it if printing \rsomething.
+ (debuginfod_end): Terminate with \n if flag set, i.e., only if the
+ default_progressfn was actually called.
+
+2020-03-27 Mark Wielaard <[email protected]>
+
+ * debuginfod.cxx (parse_opt): Check port is not zero.
+
+2020-03-28 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (handle_buildid_r_match): During archive
+ extraction / fdcache prefetching, set the mtime of each
+ file in the cache.
+
+2020-03-27 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-find.c (main): Extract buildid from /binary/ if
+ given instead of hex string.
+ * Makefile.am: Add elfutils library prereqs for debuginfod-find.
+
+2020-03-24 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.h, libdebuginfod.map: New functions for _add_http_header.
+ * debuginfod-client.c (struct debuginfod_client): Add headers fields.
+ (debuginfod_add_http_header): New client api to add outgoing headers.
+ (add_default_headers): Renamed from add_extra_headers, skip if flag.
+ (debuginfod_query_server): Pass accumulated headers to libcurl.
+ (debuginfod_end): Clean accumulated headers.
+ (debuginfod_find_*): Add default headers at this point.
+ * debuginfod.cxx (handle_buildid): Add conn pointer. Use it to relay
+ incoming UA and XFF headers to federated upstream debuginfods.
+
+2020-03-26 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (handler_cb): Export two families of metrics for
+ prometheus traffic analysis: response times and data amounts.
+
+2020-03-26 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (parse_opt): For -U, prefer dpkg-deb
+ after all if access(3)-able, fallback to bsdtar.
+
+2020-03-25 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (parse_opt): Associate a bsdtar subshell with
+ the .deb & .ddeb extensions, instead of dpkg-deb.
+
+2020-03-26 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Don't
+ set CURLOPT_PATH_AS_IS on old curl. Mostly harmless.
+
+2020-03-24 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Set
+ CURLOPT_PATH_AS_IS, to propagate file names verbatim.
+ * debuginfod.cxx (canon_pathname): Implement RFC3986
+ style pathname canonicalization.
+ (handle_buildid): Canonicalize incoming webapi source
+ paths, accept either one.
+ (scan_source_file, archive_classify): Store both
+ original and canonicalized dwarf-source file names.
+
+2020-03-24 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (handle_buildid): In case of federated fallback
+ queries, handle errors analogously to local ENOENT/404.
+ (handle_metrics): Return a size-of-response value.
+ (handler_cb): Add code to time entire application-side processing
+ stage + response sizes + http codes, so as to emit a complete
+ httpd-flavoured log line for each webapi request.
+
+2020-03-24 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Print the
+ default_progressfn terminating \n message only if that progressfn
+ is actually set.
+
+2020-03-24 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-find.c (main): Correct /source full-pathness check for
+ "debuginfod-find -v source deadbeef /pathname" case.
+
+2020-03-22 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (struct debuginfod_client): Add url field.
+ (struct handle_data): Add client field as backpointer.
+ (debuginfod_write_callback): Compute & save URL.
+ (default_progressfn): Print front pieces of the URL.
+ (debuginfod_query_server): Clear URL and cleanup after progressfn.
+ * debuginfod-find.c (main): Print URL at transfer conclusion.
+
+2020-03-22 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.h, libdebuginfod.map: New functions for _get/set_user().
+ * debuginfod-client.c: Implement them.
+ * debuginfod-find.c: Include a token call just for testing them.
+
+2020-03-03 Aaron Merey <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Update
+ cache_path even when new default path already exists.
+
+2020-02-27 Aaron Merey <[email protected]>
+
+ * debuginfod-client.c (xalloc_str): New macro. Call
+ asprintf with error checking.
+ (debuginfod_query_server): Use XDG_CACHE_HOME as a default
+ cache location if it is set. Replace snprintf with xalloc_str.
+
+2020-02-26 Konrad Kleine <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Handle curl's
+ response code correctly when DEBUGINFOD_URLS begin with file://
+
+2020-02-25 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (parse_opt): Treat -R as if -Z.rpm .
+
+2020-02-25 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (fdcache_prefetch): New parameter.
+ (parse_opt): Parse it.
+ (main): Default it.
+ (fdcache::fd_size_mb): Change to double for accuracy.
+ (fdcache::probe): New function.
+ (fdcache::intern): New option to intern at end of LRU.
+ (fdcache::lookup): Clean fdcache.
+ (handle_buildid_r_match): Implement multi-stage archive
+ parsing, with optional prefetching of extracted contents
+ into the fdcache.
+
+2020-02-19 Aaron Merey <[email protected]>
+
+ * debuginfod-client.c (debuginfod_clean_cache): Restrict
+ cleanup to client-pattern files.
+
+2020-02-05 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (argp options): Add -Z option.
+ (canonicalized_archive_entry_pathname): New function for
+ distro-agnostic file name matching/storage.
+
+2020-01-22 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (dwarf_extract_source_paths): Don't print
+ "skipping hat" messages at verbosity <=3, too noisy.
+
+2020-01-19 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (scanq): Rework to let groomer/fts threads
+ synchronize with an empty workqueue, and lock out workqueue
+ consumers.
+ (thread_groom): Adopt new scanq idle APIs to lock out scanners.
+ (thread_main_fts_source_paths): Adopt new scanq idler API to
+ avoid being restarted while scanners haven't even finished yet.
+ (thread_main_*): Increment thread_work_total metric only after
+ a work cycle is completed, not when it begins.
+
+2020-01-18 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (thread_main_scanner): Handle empty source_paths[].
+
+2020-01-11 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (libarchive_fdcache): New class/facility to own a
+ cache of temporary files that were previously extracted from an
+ archive. If only it could store just unlinked fd's instead of
+ filenames.
+ (handle_buildid_r_match): Use it to answer dwz/altdebug and webapi
+ requests.
+ (groom): Clean it.
+ (main): Initialize the cache control parameters from heuristics.
+ Use a consistent tmpdir for these and tmp files elsewhere.
+
+2020-01-11 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (conninfo): Print User-Agent and X-Forwarded-For
+ request headers, after mild safety-censorship (for easier machine
+ processing).
+
+2020-01-11 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx: Rework threading model.
+ (workq): New class for concurrent work-queue.
+ (semaphore): Removed class, now unused.
+ (scan_source_file_path): Rework into ...
+ (scan_source_file): New function.
+ (thread_main_scan_source_file_path): Nuke.
+ (scan_source_archive_path): Rework into ...
+ (scan_archive_file): New function.
+ (thread_main_scanner): New function for scanner threads.
+ (thread_main_fts_source_paths): New function for traversal thread.
+ (scan_source_paths): ... doing this.
+ (thread_groom): Tweak metrics for consistency.
+ (main): Start 1 traversal and N scanner threads if needed.
+
+2020-01-02 Mark Wielaard <[email protected]>
+
+ * debuginfod.cxx (default_connect_timeout): Removed.
+ (default_transfer_timeout): Removed.
+ (default_timeout): New. Default to 90 seconds.
+ (debuginfod_query_server): Parse server_timeout_envvar as one number.
+ Set as CURLOPT_LOW_SPEED_TIME, with CURLOPT_LOW_SPEED_LIMIT as 100K.
+
+2020-01-09 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (add_extra_headers): New function,
+ based on mjw's draft.
+ (debuginfod_query_server): Call it.
+
+2019-12-22 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod.cxx (*_rpm_*): Rename to *_archive_* throughout.
+ (scan_archives): New read-mostly global to identify archive
+ file extensions and corresponding extractor commands.
+ (parse_opt): Handle new -U flag.
+
+2019-12-19 Frank Ch. Eigler <[email protected]>
+
+ * debuginfod-client.c (default_progressfn): New function.
+ (debuginfod_begin): Use it if $DEBUGINFOD_PROGRESS set.
+ (server_timeout): Bump to 30 seconds.
+ (debuginfod_query_server): Call progressfn -after- rather than
+ before curl ops, to make it likely that a successful transfer
+ results in a final a=b call. Tweak cleanup sequence.
+ * debuginfod.h: Document $DEBUGINFOD_PROGRESS name.
+
+2019-12-09 Mark Wielaard <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Check
+ server_urls_envvar early.
+
+2019-12-03 Mark Wielaard <[email protected]>
+
+ * debuginfod-client.c (debuginfod_query_server): Use separate
+ local variables for CURLcode curl_res and CURLMcode curlm_res.
+
2019-11-26 Mark Wielaard <[email protected]>
* Makefile.am (BUILD_STATIC): Add needed libraries for libdw and
diff --git a/debuginfod/Makefile.am b/debuginfod/Makefile.am
index 7ae74e06..47b6e431 100644
--- a/debuginfod/Makefile.am
+++ b/debuginfod/Makefile.am
@@ -62,7 +62,7 @@ debuginfod_SOURCES = debuginfod.cxx
debuginfod_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod) $(libmicrohttpd_LIBS) $(libcurl_LIBS) $(sqlite3_LIBS) $(libarchive_LIBS) -lpthread -ldl
debuginfod_find_SOURCES = debuginfod-find.c
-debuginfod_find_LDADD = $(libeu) $(libdebuginfod)
+debuginfod_find_LDADD = $(libdw) $(libelf) $(libeu) $(libdebuginfod)
lib_LIBRARIES = libdebuginfod.a
noinst_LIBRARIES = libdebuginfod_pic.a
diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c
index 6e62b86c..a7dfbfb1 100644
--- a/debuginfod/debuginfod-client.c
+++ b/debuginfod/debuginfod-client.c
@@ -1,5 +1,5 @@
/* Retrieve ELF / DWARF / source files from the debuginfod.
- Copyright (C) 2019 Red Hat, Inc.
+ Copyright (C) 2019-2020 Red Hat, Inc.
This file is part of elfutils.
This file is free software; you can redistribute it and/or modify
@@ -40,6 +40,7 @@
#include "config.h"
#include "debuginfod.h"
+#include "system.h"
#include <assert.h>
#include <dirent.h>
#include <stdio.h>
@@ -49,6 +50,7 @@
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
+#include <regex.h>
#include <string.h>
#include <stdbool.h>
#include <linux/limits.h>
@@ -57,6 +59,7 @@
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/utsname.h>
#include <curl/curl.h>
/* If fts.h is included before config.h, its indirect inclusions may not
@@ -76,6 +79,20 @@ struct debuginfod_client
/* Progress/interrupt callback function. */
debuginfod_progressfn_t progressfn;
+ /* Stores user data. */
+ void* user_data;
+
+ /* Stores current/last url, if any. */
+ char* url;
+
+ /* Accumulates outgoing http header names/values. */
+ int user_agent_set_p; /* affects add_default_headers */
+ struct curl_slist *headers;
+
+ /* Flags the default_progressfn having printed something that
+ debuginfod_end needs to terminate. */
+ int default_progressfn_printed_p;
+
/* Can contain all other context, like cache_path, server_urls,
timeout or other info gotten from environment variables, the
handle data, etc. So those don't have to be reparsed and
@@ -96,18 +113,18 @@ static const time_t cache_default_max_unused_age_s = 604800; /* 1 week */
/* Location of the cache of files downloaded from debuginfods.
The default parent directory is $HOME, or '/' if $HOME doesn't exist. */
static const char *cache_default_name = ".debuginfod_client_cache";
+static const char *cache_xdg_name = "debuginfod_client";
static const char *cache_path_envvar = DEBUGINFOD_CACHE_PATH_ENV_VAR;
-/* URLs of debuginfods, separated by url_delim.
- This env var must be set for debuginfod-client to run. */
+/* URLs of debuginfods, separated by url_delim. */
static const char *server_urls_envvar = DEBUGINFOD_URLS_ENV_VAR;
static const char *url_delim = " ";
static const char url_delim_char = ' ';
-/* Timeout for debuginfods, in seconds.
- This env var must be set for debuginfod-client to run. */
+/* Timeout for debuginfods, in seconds (to get at least 100K). */
static const char *server_timeout_envvar = DEBUGINFOD_TIMEOUT_ENV_VAR;
-static int server_timeout = 5;
+static const long default_timeout = 90;
+
/* Data associated with a particular CURL easy handle. Passed to
the write callback. */
@@ -122,6 +139,9 @@ struct handle_data
/* This handle. */
CURL *handle;
+ /* The client object whom we're serving. */
+ debuginfod_client *client;
+
/* Pointer to handle that should write to fd. Initially points to NULL,
then points to the first handle that begins writing the target file
to the cache. Used to ensure that a file is not downloaded from
@@ -138,7 +158,17 @@ debuginfod_write_callback (char *ptr, size_t size, size_t nmemb, void *data)
/* Indicate to other handles that they can abort their transfer. */
if (*d->target_handle == NULL)
- *d->target_handle = d->handle;
+ {
+ *d->target_handle = d->handle;
+ /* update the client object */
+ const char *url = NULL;
+ (void) curl_easy_getinfo (d->handle, CURLINFO_EFFECTIVE_URL, &url);
+ if (url)
+ {
+ free (d->client->url);
+ d->client->url = strdup(url); /* ok if fails */
+ }
+ }
/* If this handle isn't the target handle, abort transfer. */
if (*d->target_handle != d->handle)
@@ -240,10 +270,19 @@ debuginfod_clean_cache(debuginfod_client *c,
if (fts == NULL)
return -errno;
+ regex_t re;
+ const char * pattern = ".*/[a-f0-9]+/(debuginfo|executable|source.*)$";
+ if (regcomp (&re, pattern, REG_EXTENDED | REG_NOSUB) != 0)
+ return -ENOMEM;
+
FTSENT *f;
long files = 0;
while ((f = fts_read(fts)) != NULL)
{
+ /* ignore any files that do not match the pattern. */
+ if (regexec (&re, f->fts_path, 0, NULL, 0) != 0)
+ continue;
+
files++;
if (c->progressfn) /* inform/check progress callback */
if ((c->progressfn) (c, files, 0))
@@ -267,7 +306,8 @@ debuginfod_clean_cache(debuginfod_client *c,
;
}
}
- fts_close(fts);
+ fts_close (fts);
+ regfree (&re);
/* Update timestamp representing when the cache was last cleaned. */
utime (interval_path, NULL);
@@ -278,6 +318,136 @@ debuginfod_clean_cache(debuginfod_client *c,
#define MAX_BUILD_ID_BYTES 64
+static void
+add_default_headers(debuginfod_client *client)
+{
+ if (client->user_agent_set_p)
+ return;
+
+ /* Compute a User-Agent: string to send. The more accurately this
+ describes this host, the likelier that the debuginfod servers
+ might be able to locate debuginfo for us. */
+
+ char* utspart = NULL;
+ struct utsname uts;
+ int rc = 0;
+ rc = uname (&uts);
+ if (rc == 0)
+ rc = asprintf(& utspart, "%s/%s", uts.sysname, uts.machine);
+ if (rc < 0)
+ utspart = NULL;
+
+ FILE *f = fopen ("/etc/os-release", "r");
+ if (f == NULL)
+ f = fopen ("/usr/lib/os-release", "r");
+ char *id = NULL;
+ char *version = NULL;
+ if (f != NULL)
+ {
+ while (id == NULL || version == NULL)
+ {
+ char buf[128];
+ char *s = &buf[0];
+ if (fgets (s, sizeof(buf), f) == NULL)
+ break;
+
+ int len = strlen (s);
+ if (len < 3)
+ continue;
+ if (s[len - 1] == '\n')
+ {
+ s[len - 1] = '\0';
+ len--;
+ }
+
+ char *v = strchr (s, '=');
+ if (v == NULL || strlen (v) < 2)
+ continue;
+
+ /* Split var and value. */
+ *v = '\0';
+ v++;
+
+ /* Remove optional quotes around value string. */
+ if (*v == '"' || *v == '\'')
+ {
+ v++;
+ s[len - 1] = '\0';
+ }
+ if (strcmp (s, "ID") == 0)
+ id = strdup (v);
+ if (strcmp (s, "VERSION_ID") == 0)
+ version = strdup (v);
+ }
+ fclose (f);
+ }
+
+ char *ua = NULL;
+ rc = asprintf(& ua, "User-Agent: %s/%s,%s,%s/%s",
+ PACKAGE_NAME, PACKAGE_VERSION,
+ utspart ?: "",
+ id ?: "",
+ version ?: "");
+ if (rc < 0)
+ ua = NULL;
+
+ if (ua)
+ (void) debuginfod_add_http_header (client, ua);
+
+ free (ua);
+ free (id);
+ free (version);
+ free (utspart);
+}
+
+
+#define xalloc_str(p, fmt, args...) \
+ do \
+ { \
+ if (asprintf (&p, fmt, args) < 0) \
+ { \
+ p = NULL; \
+ rc = -ENOMEM; \
+ goto out; \
+ } \
+ } while (0)
+
+
+/* Offer a basic form of progress tracing */
+static int
+default_progressfn (debuginfod_client *c, long a, long b)
+{
+ const char* url = debuginfod_get_url (c);
+ int len = 0;
+
+ /* We prefer to print the host part of the URL to keep the
+ message short. */
+ if (url != NULL)
+ {
+ const char* buildid = strstr(url, "buildid/");
+ if (buildid != NULL)
+ len = (buildid - url);
+ else
+ len = strlen(url);
+ }
+
+ if (b == 0 || url==NULL) /* early stage */
+ dprintf(STDERR_FILENO,
+ "\rDownloading %c", "-/|\\"[a % 4]);
+ else if (b < 0) /* download in progress but unknown total length */
+ dprintf(STDERR_FILENO,
+ "\rDownloading from %.*s %ld",
+ len, url, a);
+ else /* download in progress, and known total length */
+ dprintf(STDERR_FILENO,
+ "\rDownloading from %.*s %ld/%ld",
+ len, url, a, b);
+ c->default_progressfn_printed_p = 1;
+
+ return 0;
+}
+
+
/* Query each of the server URLs found in $DEBUGINFOD_URLS for the file
with the specified build-id, type (debuginfo, executable or source)
and filename. filename may be NULL. If found, return a file
@@ -291,16 +461,32 @@ debuginfod_query_server (debuginfod_client *c,
const char *filename,
char **path)
{
- char *urls_envvar;
char *server_urls;
- char cache_path[PATH_MAX];
- char maxage_path[PATH_MAX*3]; /* These *3 multipliers are to shut up gcc -Wformat-truncation */
- char interval_path[PATH_MAX*4];
- char target_cache_dir[PATH_MAX*2];
- char target_cache_path[PATH_MAX*4];
- char target_cache_tmppath[PATH_MAX*5];
- char suffix[PATH_MAX*2];
+ char *urls_envvar;
+ char *cache_path = NULL;
+ char *maxage_path = NULL;
+ char *interval_path = NULL;
+ char *target_cache_dir = NULL;
+ char *target_cache_path = NULL;
+ char *target_cache_tmppath = NULL;
+ char suffix[PATH_MAX];
char build_id_bytes[MAX_BUILD_ID_BYTES * 2 + 1];
+ int rc;
+
+ /* Clear the obsolete URL from a previous _find operation. */
+ free (c->url);
+ c->url = NULL;
+
+ add_default_headers(c);
+
+ /* Is there any server we can query? If not, don't do any work,
+ just return with ENOSYS. Don't even access the cache. */
+ urls_envvar = getenv(server_urls_envvar);
+ if (urls_envvar == NULL || urls_envvar[0] == '\0')
+ {
+ rc = -ENOSYS;
+ goto out;
+ }
/* Copy lowercase hex representation of build_id into buf. */
if ((build_id_len >= MAX_BUILD_ID_BYTES) ||
@@ -349,31 +535,75 @@ debuginfod_query_server (debuginfod_client *c,
/* set paths needed to perform the query
example format
- cache_path: $HOME/.debuginfod_cache
- target_cache_dir: $HOME/.debuginfod_cache/0123abcd
- target_cache_path: $HOME/.debuginfod_cache/0123abcd/debuginfo
- target_cache_path: $HOME/.debuginfod_cache/0123abcd/source#PATH#TO#SOURCE ?
+ cache_path: $HOME/.cache
+ target_cache_dir: $HOME/.cache/0123abcd
+ target_cache_path: $HOME/.cache/0123abcd/debuginfo
+ target_cache_path: $HOME/.cache/0123abcd/source#PATH#TO#SOURCE ?
+
+ $XDG_CACHE_HOME takes priority over $HOME/.cache.
+ $DEBUGINFOD_CACHE_PATH takes priority over $HOME/.cache and $XDG_CACHE_HOME.
*/
- if (getenv(cache_path_envvar))
- strcpy(cache_path, getenv(cache_path_envvar));
+ /* Determine location of the cache. The path specified by the debuginfod
+ cache environment variable takes priority. */
+ char *cache_var = getenv(cache_path_envvar);
+ if (cache_var != NULL && strlen (cache_var) > 0)
+ xalloc_str (cache_path, "%s", cache_var);
else
{
- if (getenv("HOME"))
- sprintf(cache_path, "%s/%s", getenv("HOME"), cache_default_name);
- else
- sprintf(cache_path, "/%s", cache_default_name);
+ /* If a cache already exists in $HOME ('/' if $HOME isn't set), then use
+ that. Otherwise use the XDG cache directory naming format. */
+ xalloc_str (cache_path, "%s/%s", getenv ("HOME") ?: "/", cache_default_name);
+
+ struct stat st;
+ if (stat (cache_path, &st) < 0)
+ {
+ char cachedir[PATH_MAX];
+ char *xdg = getenv ("XDG_CACHE_HOME");
+
+ if (xdg != NULL && strlen (xdg) > 0)
+ snprintf (cachedir, PATH_MAX, "%s", xdg);
+ else
+ snprintf (cachedir, PATH_MAX, "%s/.cache", getenv ("HOME") ?: "/");
+
+ /* Create XDG cache directory if it doesn't exist. */
+ if (stat (cachedir, &st) == 0)
+ {
+ if (! S_ISDIR (st.st_mode))
+ {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+ else
+ {
+ rc = mkdir (cachedir, 0700);
+
+ /* Also check for EEXIST and S_ISDIR in case another client just
+ happened to create the cache. */
+ if (rc < 0
+ && (errno != EEXIST
+ || stat (cachedir, &st) != 0
+ || ! S_ISDIR (st.st_mode)))
+ {
+ rc = -errno;
+ goto out;
+ }
+ }
+
+ free (cache_path);
+ xalloc_str (cache_path, "%s/%s", cachedir, cache_xdg_name);
+ }
}
- /* avoid using snprintf here due to compiler warning. */
- snprintf(target_cache_dir, sizeof(target_cache_dir), "%s/%s", cache_path, build_id_bytes);
- snprintf(target_cache_path, sizeof(target_cache_path), "%s/%s%s", target_cache_dir, type, suffix);
- snprintf(target_cache_tmppath, sizeof(target_cache_tmppath), "%s.XXXXXX", target_cache_path);
+ xalloc_str (target_cache_dir, "%s/%s", cache_path, build_id_bytes);
+ xalloc_str (target_cache_path, "%s/%s%s", target_cache_dir, type, suffix);
+ xalloc_str (target_cache_tmppath, "%s.XXXXXX", target_cache_path);
/* XXX combine these */
- snprintf(interval_path, sizeof(interval_path), "%s/%s", cache_path, cache_clean_interval_filename);
- snprintf(maxage_path, sizeof(maxage_path), "%s/%s", cache_path, cache_max_unused_age_filename);
- int rc = debuginfod_init_cache(cache_path, interval_path, maxage_path);
+ xalloc_str (interval_path, "%s/%s", cache_path, cache_clean_interval_filename);
+ xalloc_str (maxage_path, "%s/%s", cache_path, cache_max_unused_age_filename);
+ rc = debuginfod_init_cache(cache_path, interval_path, maxage_path);
if (rc != 0)
goto out;
rc = debuginfod_clean_cache(c, cache_path, interval_path, maxage_path);
@@ -387,19 +617,14 @@ debuginfod_query_server (debuginfod_client *c,
/* Success!!!! */
if (path != NULL)
*path = strdup(target_cache_path);
- return fd;
- }
-
-
- urls_envvar = getenv(server_urls_envvar);
- if (urls_envvar == NULL || urls_envvar[0] == '\0')
- {
- rc = -ENOSYS;
+ rc = fd;
goto out;
}
- if (getenv(server_timeout_envvar))
- server_timeout = atoi (getenv(server_timeout_envvar));
+ long timeout = default_timeout;
+ const char* timeout_envvar = getenv(server_timeout_envvar);
+ if (timeout_envvar != NULL)
+ timeout = atoi (timeout_envvar);
/* make a copy of the envvar so it can be safely modified. */
server_urls = strdup(urls_envvar);
@@ -464,6 +689,7 @@ debuginfod_query_server (debuginfod_client *c,
data[i].fd = fd;
data[i].target_handle = &target_handle;
data[i].handle = curl_easy_init();
+ data[i].client = c;
if (data[i].handle == NULL)
{
@@ -491,14 +717,28 @@ debuginfod_query_server (debuginfod_client *c,
CURLOPT_WRITEFUNCTION,
debuginfod_write_callback);
curl_easy_setopt(data[i].handle, CURLOPT_WRITEDATA, (void*)&data[i]);
- curl_easy_setopt(data[i].handle, CURLOPT_TIMEOUT, (long) server_timeout);
+ if (timeout > 0)
+ {
+ /* Make sure there is at least some progress,
+ try to get at least 100K per timeout seconds. */
+ curl_easy_setopt (data[i].handle, CURLOPT_LOW_SPEED_TIME,
+ timeout);
+ curl_easy_setopt (data[i].handle, CURLOPT_LOW_SPEED_LIMIT,
+ 100 * 1024L);
+ }
curl_easy_setopt(data[i].handle, CURLOPT_FILETIME, (long) 1);
curl_easy_setopt(data[i].handle, CURLOPT_FOLLOWLOCATION, (long) 1);
curl_easy_setopt(data[i].handle, CURLOPT_FAILONERROR, (long) 1);
curl_easy_setopt(data[i].handle, CURLOPT_NOSIGNAL, (long) 1);
+#if LIBCURL_VERSION_NUM >= 0x072a00 /* 7.42.0 */
+ curl_easy_setopt(data[i].handle, CURLOPT_PATH_AS_IS, (long) 1);
+#else
+ /* On old curl; no big deal, canonicalization here is almost the
+ same, except perhaps for ? # type decorations at the tail. */
+#endif
curl_easy_setopt(data[i].handle, CURLOPT_AUTOREFERER, (long) 1);
curl_easy_setopt(data[i].handle, CURLOPT_ACCEPT_ENCODING, "");
- curl_easy_setopt(data[i].handle, CURLOPT_USERAGENT, (void*) PACKAGE_STRING);
+ curl_easy_setopt(data[i].handle, CURLOPT_HTTPHEADER, c->headers);
curl_multi_add_handle(curlm, data[i].handle);
server_url = strtok_r(NULL, url_delim, &strtok_saveptr);
@@ -509,15 +749,35 @@ debuginfod_query_server (debuginfod_client *c,
long loops = 0;
do
{
- CURLMcode curl_res;
+ /* Wait 1 second, the minimum DEBUGINFOD_TIMEOUT. */
+ curl_multi_wait(curlm, NULL, 0, 1000, NULL);
+
+ /* If the target file has been found, abort the other queries. */
+ if (target_handle != NULL)
+ for (int i = 0; i < num_urls; i++)
+ if (data[i].handle != target_handle)
+ curl_multi_remove_handle(curlm, data[i].handle);
+
+ CURLMcode curlm_res = curl_multi_perform(curlm, &still_running);
+ if (curlm_res != CURLM_OK)
+ {
+ switch (curlm_res)
+ {
+ case CURLM_CALL_MULTI_PERFORM: continue;
+ case CURLM_OUT_OF_MEMORY: rc = -ENOMEM; break;
+ default: rc = -ENETUNREACH; break;
+ }
+ goto out1;
+ }
if (c->progressfn) /* inform/check progress callback */
{
loops ++;
long pa = loops; /* default params for progress callback */
- long pb = 0;
+ long pb = 0; /* transfer_timeout tempting, but loops != elapsed-time */
if (target_handle) /* we've committed to a server; report its download progress */
{
+ CURLcode curl_res;
#ifdef CURLINFO_SIZE_DOWNLOAD_T
curl_off_t dl;
curl_res = curl_easy_getinfo(target_handle,
@@ -534,6 +794,8 @@ debuginfod_query_server (debuginfod_client *c,
pa = (dl > LONG_MAX ? LONG_MAX : (long)dl);
#endif
+ /* NB: If going through deflate-compressing proxies, this
+ number is likely to be unavailable, so -1 may show. */
#ifdef CURLINFO_CURLINFO_CONTENT_LENGTH_DOWNLOAD_T
curl_off_t cl;
curl_res = curl_easy_getinfo(target_handle,
@@ -554,27 +816,6 @@ debuginfod_query_server (debuginfod_client *c,
if ((*c->progressfn) (c, pa, pb))
break;
}
-
- /* Wait 1 second, the minimum DEBUGINFOD_TIMEOUT. */
- curl_multi_wait(curlm, NULL, 0, 1000, NULL);
-
- /* If the target file has been found, abort the other queries. */
- if (target_handle != NULL)
- for (int i = 0; i < num_urls; i++)
- if (data[i].handle != target_handle)
- curl_multi_remove_handle(curlm, data[i].handle);
-
- curl_res = curl_multi_perform(curlm, &still_running);
- if (curl_res != CURLM_OK)
- {
- switch (curl_res)
- {
- case CURLM_CALL_MULTI_PERFORM: continue;
- case CURLM_OUT_OF_MEMORY: rc = -ENOMEM; break;
- default: rc = -ENETUNREACH; break;
- }
- goto out1;
- }
} while (still_running);
/* Check whether a query was successful. If so, assign its handle
@@ -610,20 +851,34 @@ debuginfod_query_server (debuginfod_client *c,
else
{
/* Query completed without an error. Confirm that the
- response code is 200 and set verified_handle. */
- long resp_code = 500;
- CURLcode curl_res;
+ response code is 200 when using HTTP/HTTPS and 0 when
+ using file:// and set verified_handle. */
- curl_res = curl_easy_getinfo(target_handle,
- CURLINFO_RESPONSE_CODE,
- &resp_code);
-
- if (curl_res == CURLE_OK
- && resp_code == 200
- && msg->easy_handle != NULL)
+ if (msg->easy_handle != NULL)
{
- verified_handle = msg->easy_handle;
- break;
+ char *effective_url = NULL;
+ long resp_code = 500;
+ CURLcode ok1 = curl_easy_getinfo (target_handle,
+ CURLINFO_EFFECTIVE_URL,
+ &effective_url);
+ CURLcode ok2 = curl_easy_getinfo (target_handle,
+ CURLINFO_RESPONSE_CODE,
+ &resp_code);
+ if(ok1 == CURLE_OK && ok2 == CURLE_OK && effective_url)
+ {
+ if (strncmp (effective_url, "http", 4) == 0)
+ if (resp_code == 200)
+ {
+ verified_handle = msg->easy_handle;
+ break;
+ }
+ if (strncmp (effective_url, "file", 4) == 0)
+ if (resp_code == 0)
+ {
+ verified_handle = msg->easy_handle;
+ break;
+ }
+ }
}
}
}
@@ -659,12 +914,14 @@ debuginfod_query_server (debuginfod_client *c,
curl_multi_cleanup (curlm);
free (data);
free (server_urls);
+
/* don't close fd - we're returning it */
/* don't unlink the tmppath; it's already been renamed. */
if (path != NULL)
*path = strdup(target_cache_path);
- return fd;
+ rc = fd;
+ goto out;
/* error exits */
out1:
@@ -673,32 +930,75 @@ debuginfod_query_server (debuginfod_client *c,
curl_multi_cleanup(curlm);
unlink (target_cache_tmppath);
+ close (fd); /* before the rmdir, otherwise it'll fail */
(void) rmdir (target_cache_dir); /* nop if not empty */
free(data);
- close (fd);
out0:
free (server_urls);
+/* general purpose exit */
out:
+ /* Conclude the last \r status line */
+ /* Another possibility is to use the ANSI CSI n K EL "Erase in Line"
+ code. That way, the previously printed messages would be erased,
+ and without a newline. */
+ if (c->default_progressfn_printed_p)
+ dprintf(STDERR_FILENO, "\n");
+
+ free (cache_path);
+ free (maxage_path);
+ free (interval_path);
+ free (target_cache_dir);
+ free (target_cache_path);
+ free (target_cache_tmppath);
return rc;
}
+
+
/* See debuginfod.h */
debuginfod_client *
debuginfod_begin (void)
{
debuginfod_client *client;
size_t size = sizeof (struct debuginfod_client);
- client = (debuginfod_client *) malloc (size);
+ client = (debuginfod_client *) calloc (1, size);
if (client != NULL)
- client->progressfn = NULL;
+ {
+ if (getenv(DEBUGINFOD_PROGRESS_ENV_VAR))
+ client->progressfn = default_progressfn;
+ }
return client;
}
void
+debuginfod_set_user_data(debuginfod_client *client,
+ void *data)
+{
+ client->user_data = data;
+}
+
+void *
+debuginfod_get_user_data(debuginfod_client *client)
+{
+ return client->user_data;
+}
+
+const char *
+debuginfod_get_url(debuginfod_client *client)
+{
+ return client->url;
+}
+
+void
debuginfod_end (debuginfod_client *client)
{
+ if (client == NULL)
+ return;
+
+ curl_slist_free_all (client->headers);
+ free (client->url);
free (client);
}
@@ -732,6 +1032,33 @@ int debuginfod_find_source(debuginfod_client *client,
}
+/* Add an outgoing HTTP header. */
+int debuginfod_add_http_header (debuginfod_client *client, const char* header)
+{
+ /* Sanity check header value is of the form Header: Value.
+ It should contain exactly one colon that isn't the first or
+ last character. */
+ char *colon = strchr (header, ':');
+ if (colon == NULL
+ || colon == header
+ || *(colon + 1) == '\0'
+ || strchr (colon + 1, ':') != NULL)
+ return -EINVAL;
+
+ struct curl_slist *temp = curl_slist_append (client->headers, header);
+ if (temp == NULL)
+ return -ENOMEM;
+
+ /* Track if User-Agent: is being set. If so, signal not to add the
+ default one. */
+ if (strncmp (header, "User-Agent:", 11) == 0)
+ client->user_agent_set_p = 1;
+
+ client->headers = temp;
+ return 0;
+}
+
+
void
debuginfod_set_progressfn(debuginfod_client *client,
debuginfod_progressfn_t fn)
diff --git a/debuginfod/debuginfod-find.c b/debuginfod/debuginfod-find.c
index 8bd3a3db..83a43ce4 100644
--- a/debuginfod/debuginfod-find.c
+++ b/debuginfod/debuginfod-find.c
@@ -1,6 +1,6 @@
/* Command-line frontend for retrieving ELF / DWARF / source files
from the debuginfod.
- Copyright (C) 2019 Red Hat, Inc.
+ Copyright (C) 2019-2020 Red Hat, Inc.
This file is part of elfutils.
This file is free software; you can redistribute it and/or modify
@@ -25,6 +25,10 @@
#include <stdlib.h>
#include <string.h>
#include <argp.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <gelf.h>
+#include <libdwelf.h>
/* Name and version of program. */
@@ -39,8 +43,12 @@ static const char doc[] = N_("Request debuginfo-related content "
/* Strings for arguments in help texts. */
static const char args_doc[] = N_("debuginfo BUILDID\n"
+ "debuginfo PATH\n"
"executable BUILDID\n"
- "source BUILDID /FILENAME");
+ "executable PATH\n"
+ "source BUILDID /FILENAME\n"
+ "source PATH /FILENAME\n");
+
/* Definitions of arguments for argp functions. */
static const struct argp_option options[] =
@@ -51,6 +59,7 @@ static const struct argp_option options[] =
/* debuginfod connection handle. */
static debuginfod_client *client;
+static int verbose;
int progressfn(debuginfod_client *c __attribute__((__unused__)),
long a, long b)
@@ -66,7 +75,8 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state)
(void) state;
switch (key)
{
- case 'v': debuginfod_set_progressfn (client, & progressfn); break;
+ case 'v': verbose++;
+ debuginfod_set_progressfn (client, & progressfn); break;
default: return ARGP_ERR_UNKNOWN;
}
return 0;
@@ -84,6 +94,8 @@ static struct argp argp =
int
main(int argc, char** argv)
{
+ elf_version (EV_CURRENT);
+
client = debuginfod_begin ();
if (client == NULL)
{
@@ -91,6 +103,9 @@ main(int argc, char** argv)
return 1;
}
+ /* Exercise user data pointer, to support testing only. */
+ debuginfod_set_user_data (client, (void *)"Progress");
+
int remaining;
(void) argp_parse (&argp, argc, argv, ARGP_IN_ORDER|ARGP_NO_ARGS, &remaining, NULL);
@@ -100,29 +115,72 @@ main(int argc, char** argv)
return 1;
}
- int rc;
+ /* If we were passed an ELF file name in the BUILDID slot, look in there. */
+ unsigned char* build_id = (unsigned char*) argv[remaining+1];
+ int build_id_len = 0; /* assume text */
+
+ int any_non_hex = 0;
+ int i;
+ for (i = 0; build_id[i] != '\0'; i++)
+ if ((build_id[i] >= '0' && build_id[i] <= '9') ||
+ (build_id[i] >= 'a' && build_id[i] <= 'f'))
+ ;
+ else
+ any_non_hex = 1;
+
+ int fd = -1;
+ Elf* elf = NULL;
+ if (any_non_hex) /* raw build-id */
+ {
+ fd = open ((char*) build_id, O_RDONLY);
+ if (fd < 0)
+ fprintf (stderr, "Cannot open %s: %s\n", build_id, strerror(errno));
+ }
+ if (fd >= 0)
+ {
+ elf = elf_begin (fd, ELF_C_READ_MMAP_PRIVATE, NULL);
+ if (elf == NULL)
+ fprintf (stderr, "Cannot elf_begin %s: %s\n", build_id, elf_errmsg(-1));
+ }
+ if (elf != NULL)
+ {
+ const void *extracted_build_id;
+ ssize_t s = dwelf_elf_gnu_build_id(elf, &extracted_build_id);
+ if (s > 0)
+ {
+ /* Success: replace the build_id pointer/len with the binary blob
+ that elfutils is keeping for us. It'll remain valid until elf_end(). */
+ build_id = (unsigned char*) extracted_build_id;
+ build_id_len = s;
+ }
+ else
+ fprintf (stderr, "Cannot extract build-id from %s: %s\n", build_id, elf_errmsg(-1));
+ }
+
char *cache_name;
+ int rc = 0;
/* Check whether FILETYPE is valid and call the appropriate
debuginfod_find_* function. If FILETYPE is "source"
then ensure a FILENAME was also supplied as an argument. */
if (strcmp(argv[remaining], "debuginfo") == 0)
rc = debuginfod_find_debuginfo(client,
- (unsigned char *)argv[remaining+1], 0,
+ build_id, build_id_len,
&cache_name);
else if (strcmp(argv[remaining], "executable") == 0)
rc = debuginfod_find_executable(client,
- (unsigned char *)argv[remaining+1], 0,
+ build_id, build_id_len,
&cache_name);
else if (strcmp(argv[remaining], "source") == 0)
{
- if (remaining+2 == argc || argv[3][0] != '/')
+ if (remaining+2 == argc || argv[remaining+2][0] != '/')
{
fprintf(stderr, "If FILETYPE is \"source\" then absolute /FILENAME must be given\n");
return 1;
}
- rc = debuginfod_find_source(client, (unsigned char *)argv[remaining+1],
- 0, argv[remaining+2], &cache_name);
+ rc = debuginfod_find_source(client,
+ build_id, build_id_len,
+ argv[remaining+2], &cache_name);
}
else
{
@@ -130,6 +188,19 @@ main(int argc, char** argv)
return 1;
}
+ if (verbose)
+ {
+ const char* url = debuginfod_get_url (client);
+ if (url != NULL)
+ fprintf(stderr, "Downloaded from %s\n", url);
+ }
+
+ debuginfod_end (client);
+ if (elf)
+ elf_end(elf);
+ if (fd >= 0)
+ close (fd);
+
if (rc < 0)
{
fprintf(stderr, "Server query failed: %s\n", strerror(-rc));
@@ -137,9 +208,7 @@ main(int argc, char** argv)
}
printf("%s\n", cache_name);
-
free (cache_name);
- debuginfod_end (client);
return 0;
}
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index aa7ffcf6..76f1fa52 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -1,5 +1,5 @@
/* Debuginfo-over-http server.
- Copyright (C) 2019 Red Hat, Inc.
+ Copyright (C) 2019-2020 Red Hat, Inc.
This file is part of elfutils.
This file is free software; you can redistribute it and/or modify
@@ -52,6 +52,7 @@ extern "C" {
#include <signal.h>
#include <sys/stat.h>
#include <sys/time.h>
+#include <sys/vfs.h>
#include <unistd.h>
#include <fcntl.h>
#include <netdb.h>
@@ -79,6 +80,7 @@ extern "C" {
#include <ostream>
#include <sstream>
#include <mutex>
+#include <deque>
#include <condition_variable>
#include <thread>
// #include <regex> // on rhel7 gcc 4.8, not competent
@@ -106,6 +108,15 @@ using namespace std;
#endif
+inline bool
+string_endswith(const string& haystack, const string& needle)
+{
+ return (haystack.size() >= needle.size() &&
+ equal(haystack.end()-needle.size(), haystack.end(),
+ needle.begin()));
+}
+
+
+// Roll this identifier for every sqlite schema incompatibility.
#define BUILDIDS "buildids9"
@@ -231,9 +242,9 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
"create view if not exists " BUILDIDS "_stats as\n"
" select 'file d/e' as label,count(*) as quantity from " BUILDIDS "_f_de\n"
"union all select 'file s',count(*) from " BUILDIDS "_f_s\n"
- "union all select 'rpm d/e',count(*) from " BUILDIDS "_r_de\n"
- "union all select 'rpm sref',count(*) from " BUILDIDS "_r_sref\n"
- "union all select 'rpm sdef',count(*) from " BUILDIDS "_r_sdef\n"
+ "union all select 'archive d/e',count(*) from " BUILDIDS "_r_de\n"
+ "union all select 'archive sref',count(*) from " BUILDIDS "_r_sref\n"
+ "union all select 'archive sdef',count(*) from " BUILDIDS "_r_sdef\n"
"union all select 'buildids',count(*) from " BUILDIDS "_buildids\n"
"union all select 'filenames',count(*) from " BUILDIDS "_files\n"
"union all select 'files scanned (#)',count(*) from " BUILDIDS "_file_mtime_scanned\n"
@@ -322,9 +333,11 @@ ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
static const struct argp_option options[] =
{
{ NULL, 0, NULL, 0, "Scanners:", 1 },
- { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning threads.", 0 },
- { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning threads.", 0 },
- // "source-oci-imageregistry" ...
+ { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning.", 0 },
+ { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning.", 0 },
+ { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning.", 0 },
+ { "scan-archive", 'Z', "EXT=CMD", 0, "Enable arbitrary archive scanning.", 0 },
+ // "source-oci-imageregistry" ...
{ NULL, 0, NULL, 0, "Options:", 2 },
{ "logical", 'L', NULL, 0, "Follow symlinks, default=ignore.", 0 },
@@ -338,7 +351,12 @@ static const struct argp_option options[] =
{ "database", 'd', "FILE", 0, "Path to sqlite database.", 0 },
{ "ddl", 'D', "SQL", 0, "Apply extra sqlite ddl/pragma to connection.", 0 },
{ "verbose", 'v', NULL, 0, "Increase verbosity.", 0 },
-
+#define ARGP_KEY_FDCACHE_FDS 0x1001
+ { "fdcache-fds", ARGP_KEY_FDCACHE_FDS, "NUM", 0, "Maximum number of archive files to keep in fdcache.", 0 },
+#define ARGP_KEY_FDCACHE_MBS 0x1002
+ { "fdcache-mbs", ARGP_KEY_FDCACHE_MBS, "MB", 0, "Maximum total size of archive file fdcache.", 0 },
+#define ARGP_KEY_FDCACHE_PREFETCH 0x1003
+ { "fdcache-prefetch", ARGP_KEY_FDCACHE_PREFETCH, "NUM", 0, "Number of archive files to prefetch into fdcache.", 0 },
{ NULL, 0, NULL, 0, NULL, 0 }
};
@@ -359,7 +377,7 @@ static struct argp argp =
static string db_path;
-static sqlite3 *db;
+static sqlite3 *db; // single connection, serialized across all our threads!
static unsigned verbose;
static volatile sig_atomic_t interrupted = 0;
static volatile sig_atomic_t sigusr1 = 0;
@@ -367,15 +385,19 @@ static volatile sig_atomic_t sigusr2 = 0;
static unsigned http_port = 8002;
static unsigned rescan_s = 300;
static unsigned groom_s = 86400;
-static unsigned maxigroom = false;
+static bool maxigroom = false;
static unsigned concurrency = std::thread::hardware_concurrency() ?: 1;
static set<string> source_paths;
static bool scan_files = false;
-static bool scan_rpms = false;
+static map<string,string> scan_archives;
static vector<string> extra_ddl;
static regex_t file_include_regex;
static regex_t file_exclude_regex;
static bool traverse_logical;
+static long fdcache_fds;
+static long fdcache_mbs;
+static long fdcache_prefetch;
+static string tmpdir;
static void set_metric(const string& key, int64_t value);
// static void inc_metric(const string& key);
@@ -387,6 +409,7 @@ static void inc_metric(const string& metric,
static void add_metric(const string& metric,
const string& lname, const string& lvalue,
int64_t value);
+// static void add_metric(const string& metric, int64_t value);
/* Handle program arguments. */
static error_t
@@ -399,10 +422,37 @@ parse_opt (int key, char *arg,
case 'v': verbose ++; break;
case 'd': db_path = string(arg); break;
case 'p': http_port = (unsigned) atoi(arg);
- if (http_port > 65535) argp_failure(state, 1, EINVAL, "port number");
+ if (http_port == 0 || http_port > 65535)
+ argp_failure(state, 1, EINVAL, "port number");
break;
case 'F': scan_files = true; break;
- case 'R': scan_rpms = true; break;
+ case 'R':
+ scan_archives[".rpm"]="cat"; // libarchive groks rpm natively
+ break;
+ case 'U':
+ if (access("/usr/bin/dpkg-deb", X_OK) == 0)
+ {
+ scan_archives[".deb"]="dpkg-deb --fsys-tarfile";
+ scan_archives[".ddeb"]="dpkg-deb --fsys-tarfile";
+ }
+ else
+ {
+ scan_archives[".deb"]="(bsdtar -O -x -f - data.tar.xz)<";
+ scan_archives[".ddeb"]="(bsdtar -O -x -f - data.tar.xz)<";
+ }
+ // .udeb too?
+ break;
+ case 'Z':
+ {
+ char* extension = strchr(arg, '=');
+ if (arg[0] == '\0')
+ argp_failure(state, 1, EINVAL, "missing EXT");
+ else if (extension)
+ scan_archives[string(arg, (extension-arg))]=string(extension+1);
+ else
+ scan_archives[string(arg)]=string("cat");
+ }
+ break;
case 'L':
traverse_logical = true;
break;
@@ -433,6 +483,15 @@ parse_opt (int key, char *arg,
if (rc != 0)
argp_failure(state, 1, EINVAL, "regular expession");
break;
+ case ARGP_KEY_FDCACHE_FDS:
+ fdcache_fds = atol (arg);
+ break;
+ case ARGP_KEY_FDCACHE_MBS:
+ fdcache_mbs = atol (arg);
+ break;
+ case ARGP_KEY_FDCACHE_PREFETCH:
+ fdcache_prefetch = atol (arg);
+ break;
case ARGP_KEY_ARG:
source_paths.insert(string(arg));
break;
@@ -503,39 +562,84 @@ struct elfutils_exception: public reportable_exception
////////////////////////////////////////////////////////////////////////
-// a c++ counting-semaphore class ... since we're c++11 not c++20
-
-class semaphore
+template <typename Payload>
+class workq
{
+ set<Payload> q; // eliminate duplicates
+ mutex mtx;
+ condition_variable cv;
+ bool dead;
+ unsigned idlers;
+
public:
- semaphore (unsigned c=1): count(c) {}
- inline void notify () {
+ workq() { dead = false; idlers = 0; }
+ ~workq() {}
+
+ void push_back(const Payload& p)
+ {
unique_lock<mutex> lock(mtx);
- count++;
- cv.notify_one();
+ q.insert (p);
+ set_metric("thread_work_pending","role","scan", q.size());
+ cv.notify_all();
}
- inline void wait() {
+
+ // kill this workqueue, wake up all idlers / scanners
+ void nuke() {
unique_lock<mutex> lock(mtx);
- while (count == 0)
+ // optional: q.clear();
+ dead = true;
+ cv.notify_all();
+ }
+
+ // block this scanner thread until there is work to do and no active idlers
+ bool wait_front (Payload& p)
+ {
+ unique_lock<mutex> lock(mtx);
+ while (!dead && (q.size() == 0 || idlers > 0))
cv.wait(lock);
- count--;
+ if (dead)
+ return false;
+ else
+ {
+ p = * q.begin();
+ q.erase (q.begin());
+ set_metric("thread_work_pending","role","scan", q.size());
+ if (q.size() == 0)
+ cv.notify_all(); // maybe wake up waiting idlers
+ return true;
+ }
}
-private:
- mutex mtx;
- condition_variable cv;
- unsigned count;
-};
+ // block this idler thread until there is no work to do
+ void wait_idle ()
+ {
+ unique_lock<mutex> lock(mtx);
+ cv.notify_all(); // maybe wake up waiting scanners
+ while (!dead && (q.size() != 0))
+ cv.wait(lock);
+ idlers ++;
+ }
-class semaphore_borrower
-{
-public:
- semaphore_borrower(semaphore* s): sem(s) { sem->wait(); }
- ~semaphore_borrower() { sem->notify(); }
-private:
- semaphore* sem;
+ void done_idle ()
+ {
+ unique_lock<mutex> lock(mtx);
+ idlers --;
+ cv.notify_all(); // maybe wake up waiting scanners, but probably not (shutting down)
+ }
};
+typedef struct stat stat_t;
+typedef pair<string,stat_t> scan_payload;
+inline bool operator< (const scan_payload& a, const scan_payload& b)
+{
+ return a.first < b.first; // don't bother comparing the stat fields
+}
+static workq<scan_payload> scanq; // just a single one
+// producer & idler: thread_main_fts_source_paths()
+// consumer: thread_main_scanner()
+// idler: thread_main_groom()
+
+
////////////////////////////////////////////////////////////////////////
@@ -706,7 +810,17 @@ private:
////////////////////////////////////////////////////////////////////////
-
+static string
+header_censor(const string& str)
+{
+ string y;
+ for (auto&& x : str)
+ {
+ if (isalnum(x) || x == '/' || x == '.' || x == ',' || x == '_' || x == ':')
+ y += x;
+ }
+ return y;
+}
static string
@@ -735,13 +849,21 @@ conninfo (struct MHD_Connection * conn)
hostname[0] = servname[0] = '\0';
}
- return string(hostname) + string(":") + string(servname);
+ // extract headers relevant to administration
+ const char* user_agent = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
+ const char* x_forwarded_for = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
+ // NB: these are untrustworthy, beware if machine-processing log files
+
+ return string(hostname) + string(":") + string(servname) +
+ string(" UA:") + header_censor(string(user_agent)) +
+ string(" XFF:") + header_censor(string(x_forwarded_for));
}
////////////////////////////////////////////////////////////////////////
+
static void
add_mhd_last_modified (struct MHD_Response *resp, time_t mtime)
{
@@ -833,6 +955,268 @@ shell_escape(const string& str)
}
+// PR25548: Perform POSIX / RFC3986 style path canonicalization on the input string.
+//
+// Namely:
+// // -> /
+// /foo/../ -> /
+// /./ -> /
+//
+// This mapping is done on dwarf-side source path names, which may
+// include these constructs, so we can deal with debuginfod clients
+// that accidentally canonicalize the paths.
+//
+// realpath(3) is close but not quite right, because it also resolves
+// symbolic links. Symlinks at the debuginfod server have nothing to
+// do with the build-time symlinks, thus they must not be considered.
+//
+// see also curl Curl_dedotdotify() aka RFC3986, which we mostly follow here
+// see also libc __realpath()
+// see also llvm llvm::sys::path::remove_dots()
+static string
+canon_pathname (const string& input)
+{
+ string i = input; // 5.2.4 (1)
+ string o;
+
+ while (i.size() != 0)
+ {
+ // 5.2.4 (2) A
+ if (i.substr(0,3) == "../")
+ i = i.substr(3);
+ else if(i.substr(0,2) == "./")
+ i = i.substr(2);
+
+ // 5.2.4 (2) B
+ else if (i.substr(0,3) == "/./")
+ i = i.substr(2);
+ else if (i == "/.")
+ i = ""; // no need to handle "/." complete-path-segment case; we're dealing with file names
+
+ // 5.2.4 (2) C
+ else if (i.substr(0,4) == "/../") {
+ i = i.substr(3);
+ string::size_type sl = o.rfind("/");
+ if (sl != string::npos)
+ o = o.substr(0, sl);
+ else
+ o = "";
+ } else if (i == "/..")
+ i = ""; // no need to handle "/.." complete-path-segment case; we're dealing with file names
+
+ // 5.2.4 (2) D
+ // no need to handle these cases; we're dealing with file names
+ else if (i == ".")
+ i = "";
+ else if (i == "..")
+ i = "";
+
+ // POSIX special: map // to /
+ else if (i.substr(0,2) == "//")
+ i = i.substr(1);
+
+ // 5.2.4 (2) E
+ else {
+ string::size_type next_slash = i.find("/", (i[0]=='/' ? 1 : 0)); // skip first slash
+ o += i.substr(0, next_slash);
+ if (next_slash == string::npos)
+ i = "";
+ else
+ i = i.substr(next_slash);
+ }
+ }
+
+ return o;
+}
+
+
+
+// A map-like class that owns a cache of file descriptors (indexed by
+// file / content names).
+//
+// If only it could use fd's instead of file names ... but we can't
+// dup(2) to create independent descriptors for the same unlinked
+// files, so would have to use some goofy linux /proc/self/fd/%d
+// hack such as the following
+
+#if 0
+int superdup(int fd)
+{
+#ifdef __linux__
+ char *fdpath = NULL;
+ int rc = asprintf(& fdpath, "/proc/self/fd/%d", fd);
+ int newfd;
+ if (rc >= 0)
+ newfd = open(fdpath, O_RDONLY);
+ else
+ newfd = -1;
+ free (fdpath);
+ return newfd;
+#else
+ return -1;
+#endif
+}
+#endif
+
+class libarchive_fdcache
+{
+private:
+ mutex fdcache_lock;
+
+ struct fdcache_entry
+ {
+ string archive;
+ string entry;
+ string fd;
+ double fd_size_mb; // slightly rounded up megabytes
+ };
+ deque<fdcache_entry> lru; // @head: most recently used
+ long max_fds;
+ long max_mbs;
+
+public:
+ void intern(const string& a, const string& b, string fd, off_t sz, bool front_p)
+ {
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++) // nuke preexisting copy
+ {
+ if (i->archive == a && i->entry == b)
+ {
+ unlink (i->fd.c_str());
+ lru.erase(i);
+ break; // must not continue iterating
+ }
+ }
+ double mb = (sz+65535)/1048576.0; // round up to 64K block
+ fdcache_entry n = { a, b, fd, mb };
+ if (front_p)
+ lru.push_front(n);
+ else
+ lru.push_back(n);
+ if (verbose > 3)
+ obatched(clog) << "fdcache interned a=" << a << " b=" << b
+ << " fd=" << fd << " mb=" << mb << " front=" << front_p << endl;
+ }
+
+ // NB: we age the cache at lookup time too
+ if (front_p)
+ this->limit(max_fds, max_mbs); // age cache if required
+ }
+
+ int lookup(const string& a, const string& b)
+ {
+ int fd = -1;
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ { // found it; move it to head of lru
+ fdcache_entry n = *i;
+ lru.erase(i); // invalidates i, so no more iteration!
+ lru.push_front(n);
+
+ fd = open(n.fd.c_str(), O_RDONLY); // NB: no problem if dup() fails; looks like cache miss
+ break;
+ }
+ }
+ }
+
+ if (fd >= 0)
+ this->limit(max_fds, max_mbs); // age cache if required
+
+ return fd;
+ }
+
+ int probe(const string& a, const string& b) // just a cache residency check - don't modify LRU state, don't open
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ return true;
+ }
+ return false;
+ }
+
+ void clear(const string& a, const string& b)
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ { // found it; move it to head of lru
+ fdcache_entry n = *i;
+ lru.erase(i); // invalidates i, so no more iteration!
+ unlink (n.fd.c_str());
+ return;
+ }
+ }
+ }
+
+ void limit(long maxfds, long maxmbs)
+ {
+ if (verbose > 3 && (this->max_fds != maxfds || this->max_mbs != maxmbs))
+ obatched(clog) << "fdcache limited to maxfds=" << maxfds << " maxmbs=" << maxmbs << endl;
+
+ unique_lock<mutex> lock(fdcache_lock);
+ this->max_fds = maxfds;
+ this->max_mbs = maxmbs;
+
+ long total_fd = 0;
+ double total_mb = 0.0;
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ // accumulate totals from most recently used one going backward
+ total_fd ++;
+ total_mb += i->fd_size_mb;
+ if (total_fd > max_fds || total_mb > max_mbs)
+ {
+ // found the cut here point!
+
+ for (auto j = i; j < lru.end(); j++) // close all the fds from here on in
+ {
+ if (verbose > 3)
+ obatched(clog) << "fdcache evicted a=" << j->archive << " b=" << j->entry
+ << " fd=" << j->fd << " mb=" << j->fd_size_mb << endl;
+ unlink (j->fd.c_str());
+ }
+
+ lru.erase(i, lru.end()); // erase the nodes generally
+ break;
+ }
+
+ }
+ }
+
+ ~libarchive_fdcache()
+ {
+ limit(0, 0);
+ }
+};
+static libarchive_fdcache fdcache;
+
+
+// For security/portability reasons, many distro-package archives have
+// a "./" in front of path names; others have nothing, others have
+// "/". Canonicalize them all to a single leading "/", with the
+// assumption that this matches the dwarf-derived file names too.
+string canonicalized_archive_entry_pathname(struct archive_entry *e)
+{
+ string fn = archive_entry_pathname(e);
+ if (fn.size() == 0)
+ return fn;
+ if (fn[0] == '/')
+ return fn;
+ if (fn[0] == '.')
+ return fn.substr(1);
+ else
+ return string("/")+fn;
+}
+
+
+
static struct MHD_Response*
handle_buildid_r_match (int64_t b_mtime,
const string& b_source0,
@@ -851,11 +1235,69 @@ handle_buildid_r_match (int64_t b_mtime,
return 0;
}
- string popen_cmd = string("rpm2cpio " + shell_escape(b_source0));
- FILE* fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
- if (fp == NULL)
- throw libc_exception (errno, string("popen ") + popen_cmd);
- defer_dtor<FILE*,int> fp_closer (fp, pclose);
+ // check for a match in the fdcache first
+ int fd = fdcache.lookup(b_source0, b_source1);
+ while (fd >= 0) // got one!; NB: this is really an if() with a possible branch out to the end
+ {
+ rc = fstat(fd, &fs);
+ if (rc < 0) // disappeared?
+ {
+ if (verbose)
+ obatched(clog) << "cannot fstat fdcache " << b_source0 << endl;
+ close(fd);
+ fdcache.clear(b_source0, b_source1);
+ break; // branch out of if "loop", to try new libarchive fetch attempt
+ }
+
+ struct MHD_Response* r = MHD_create_response_from_fd (fs.st_size, fd);
+ if (r == 0)
+ {
+ if (verbose)
+ obatched(clog) << "cannot create fd-response for " << b_source0 << endl;
+ close(fd);
+ break; // branch out of if "loop", to try new libarchive fetch attempt
+ }
+
+ inc_metric ("http_responses_total","result","archive fdcache");
+
+ MHD_add_response_header (r, "Content-Type", "application/octet-stream");
+ add_mhd_last_modified (r, fs.st_mtime);
+ if (verbose > 1)
+ obatched(clog) << "serving fdcache archive " << b_source0 << " file " << b_source1 << endl;
+ /* libmicrohttpd will close it. */
+ if (result_fd)
+ *result_fd = fd;
+ return r;
+ // NB: see, we never go around the 'loop' more than once
+ }
+
+ // no match ... grumble, must process the archive
+ string archive_decoder = "/dev/null";
+ string archive_extension = "";
+ for (auto&& arch : scan_archives)
+ if (string_endswith(b_source0, arch.first))
+ {
+ archive_extension = arch.first;
+ archive_decoder = arch.second;
+ }
+ FILE* fp;
+ defer_dtor<FILE*,int>::dtor_fn dfn;
+ if (archive_decoder != "cat")
+ {
+ string popen_cmd = archive_decoder + " " + shell_escape(b_source0);
+ fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
+ dfn = pclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("popen ") + popen_cmd);
+ }
+ else
+ {
+ fp = fopen (b_source0.c_str(), "r");
+ dfn = fclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("fopen ") + b_source0);
+ }
+ defer_dtor<FILE*,int> fp_closer (fp, dfn);
struct archive *a;
a = archive_read_new();
@@ -863,19 +1305,30 @@ handle_buildid_r_match (int64_t b_mtime,
throw archive_exception("cannot create archive reader");
defer_dtor<struct archive*,int> archive_closer (a, archive_read_free);
- rc = archive_read_support_format_cpio(a);
+ rc = archive_read_support_format_all(a);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot select cpio format");
+ throw archive_exception(a, "cannot select all format");
rc = archive_read_support_filter_all(a);
if (rc != ARCHIVE_OK)
throw archive_exception(a, "cannot select all filters");
rc = archive_read_open_FILE (a, fp);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot open archive from rpm2cpio pipe");
+ throw archive_exception(a, "cannot open archive from pipe");
- while(1) // parse cpio archive entries
+ // archive traversal is in three stages, no, four stages:
+ // 1) skip entries whose names do not match the requested one
+ // 2) extract the matching entry name (set r = result)
+ // 3) extract some number of prefetched entries (just into fdcache)
+ // 4) abort any further processing
+ struct MHD_Response* r = 0; // will set in stage 2
+ unsigned prefetch_count = fdcache_prefetch; // will decrement in stage 3
+
+ while(r == 0 || prefetch_count > 0) // stage 1, 2, or 3
{
+ if (interrupted)
+ break;
+
struct archive_entry *e;
rc = archive_read_next_header (a, &e);
if (rc != ARCHIVE_OK)
@@ -884,48 +1337,81 @@ handle_buildid_r_match (int64_t b_mtime,
if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
continue;
- string fn = archive_entry_pathname (e);
- if (fn != string(".")+b_source1)
+ string fn = canonicalized_archive_entry_pathname (e);
+ if ((r == 0) && (fn != b_source1)) // stage 1
+ continue;
+
+ if (fdcache.probe (b_source0, fn)) // skip if already interned
continue;
// extract this file to a temporary file
- char tmppath[PATH_MAX] = "/tmp/debuginfod.XXXXXX"; // XXX: $TMP_DIR etc.
- int fd = mkstemp (tmppath);
+ char* tmppath = NULL;
+ rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
+ if (rc < 0)
+ throw libc_exception (ENOMEM, "cannot allocate tmppath");
+ defer_dtor<void*,void> tmmpath_freer (tmppath, free);
+ fd = mkstemp (tmppath);
if (fd < 0)
throw libc_exception (errno, "cannot create temporary file");
- unlink (tmppath); // unlink now so OS will release the file as soon as we close the fd
+ // NB: don't unlink (tmppath), as fdcache will take charge of it.
rc = archive_read_data_into_fd (a, fd);
- if (rc != ARCHIVE_OK)
+ if (rc != ARCHIVE_OK) // e.g. ENOSPC!
{
close (fd);
+ unlink (tmppath);
throw archive_exception(a, "cannot extract file");
}
- inc_metric ("http_responses_total","result","rpm");
- struct MHD_Response* r = MHD_create_response_from_fd (archive_entry_size(e), fd);
+ // Set the mtime so the fdcache file mtimes, even prefetched ones,
+ // propagate to future webapi clients.
+ struct timeval tvs[2];
+ tvs[0].tv_sec = tvs[1].tv_sec = archive_entry_mtime(e);
+ tvs[0].tv_usec = tvs[1].tv_usec = 0;
+ (void) futimes (fd, tvs); /* best effort */
+
+ if (r != 0) // stage 3
+ {
+ // NB: now we know we have a complete reusable file; make fdcache
+ // responsible for unlinking it later.
+ fdcache.intern(b_source0, fn,
+ tmppath, archive_entry_size(e),
+ false); // prefetched ones go to back of lru
+ prefetch_count --;
+ close (fd); // we're not saving this fd to make a mhd-response from!
+ continue;
+ }
+
+ // NB: now we know we have a complete reusable file; make fdcache
+ // responsible for unlinking it later.
+ fdcache.intern(b_source0, b_source1,
+ tmppath, archive_entry_size(e),
+ true); // requested ones go to the front of lru
+
+ inc_metric ("http_responses_total","result",archive_extension + " archive");
+ r = MHD_create_response_from_fd (archive_entry_size(e), fd);
if (r == 0)
{
if (verbose)
obatched(clog) << "cannot create fd-response for " << b_source0 << endl;
close(fd);
- break; // assume no chance of better luck around another iteration
+ break; // assume no chance of better luck around another iteration; no other copies of same file
}
else
{
MHD_add_response_header (r, "Content-Type", "application/octet-stream");
add_mhd_last_modified (r, archive_entry_mtime(e));
if (verbose > 1)
- obatched(clog) << "serving rpm " << b_source0 << " file " << b_source1 << endl;
+ obatched(clog) << "serving archive " << b_source0 << " file " << b_source1 << endl;
/* libmicrohttpd will close it. */
if (result_fd)
*result_fd = fd;
- return r;
+ continue;
}
}
// XXX: rpm/file not found: delete this R entry?
- return 0;
+ return r;
}
@@ -955,11 +1441,12 @@ debuginfod_find_progress (debuginfod_client *, long a, long b)
}
-static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
- const string& artifacttype /* unsafe */,
- const string& suffix /* unsafe */,
- int *result_fd
- )
+static struct MHD_Response*
+handle_buildid (MHD_Connection* conn,
+ const string& buildid /* unsafe */,
+ const string& artifacttype /* unsafe */,
+ const string& suffix /* unsafe */,
+ int *result_fd)
{
// validate artifacttype
string atype_code;
@@ -1001,12 +1488,17 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
}
else if (atype_code == "S")
{
+ // PR25548
+ // Incoming source queries may come in with either dwarf-level OR canonicalized paths.
+ // We let the query pass with either one.
+
pp = new sqlite_ps (db, "mhd-query-s",
- "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_s where buildid = ? and artifactsrc = ? "
+ "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_s where buildid = ? and artifactsrc in (?,?) "
"order by sharedprefix(source0,source0ref) desc, mtime desc");
pp->reset();
pp->bind(1, buildid);
pp->bind(2, suffix);
+ pp->bind(3, canon_pathname(suffix));
}
unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return
@@ -1043,6 +1535,35 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
{
debuginfod_set_progressfn (client, & debuginfod_find_progress);
+ if (conn)
+ {
+ // Transcribe incoming User-Agent:
+ string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
+ string ua_complete = string("User-Agent: ") + ua;
+ debuginfod_add_http_header (client, ua_complete.c_str());
+
+ // Compute larger XFF:, for avoiding info loss during
+ // federation, and for future cyclicity detection.
+ string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
+ if (xff != "")
+ xff += string(", "); // comma separated list
+
+ // Compute the client's numeric IP address only - so can't merge with conninfo()
+ const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
+ MHD_CONNECTION_INFO_CLIENT_ADDRESS);
+ struct sockaddr *so = u ? u->client_addr : 0;
+ char hostname[256] = ""; // RFC1035
+ if (so && so->sa_family == AF_INET)
+ (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
+ NI_NUMERICHOST);
+ else if (so && so->sa_family == AF_INET6)
+ (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
+ NI_NUMERICHOST);
+
+ string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
+ debuginfod_add_http_header (client, xff_complete.c_str());
+ }
+
if (artifacttype == "debuginfo")
fd = debuginfod_find_debuginfo (client,
(const unsigned char*) buildid.c_str(),
@@ -1081,8 +1602,16 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
}
close (fd);
}
- else if (fd != -ENOSYS) // no DEBUGINFOD_URLS configured
- throw libc_exception(-fd, "upstream debuginfod query failed");
+ else
+ switch(fd)
+ {
+ case -ENOSYS:
+ break;
+ case -ENOENT:
+ break;
+ default: // some more tricky error
+ throw libc_exception(-fd, "upstream debuginfod query failed");
+ }
throw reportable_exception(MHD_HTTP_NOT_FOUND, "not found");
}
@@ -1157,13 +1686,22 @@ add_metric(const string& metric,
unique_lock<mutex> lock(metrics_lock);
metrics[key] += value;
}
+#if 0
+static void
+add_metric(const string& metric,
+ int64_t value)
+{
+ unique_lock<mutex> lock(metrics_lock);
+ metrics[metric] += value;
+}
+#endif
// and more for higher arity labels if needed
static struct MHD_Response*
-handle_metrics ()
+handle_metrics (off_t* size)
{
stringstream o;
{
@@ -1175,6 +1713,7 @@ handle_metrics ()
MHD_Response* r = MHD_create_response_from_buffer (os.size(),
(void*) os.c_str(),
MHD_RESPMEM_MUST_COPY);
+ *size = os.size();
MHD_add_response_header (r, "Content-Type", "text/plain");
return r;
}
@@ -1197,8 +1736,11 @@ handler_cb (void * /*cls*/,
struct MHD_Response *r = NULL;
string url_copy = url;
- if (verbose)
- obatched(clog) << conninfo(connection) << " " << method << " " << url << endl;
+ int rc = MHD_NO; // mhd
+ int http_code = 500;
+ off_t http_size = -1;
+ struct timeval tv_start, tv_end;
+ gettimeofday (&tv_start, NULL);
try
{
@@ -1231,12 +1773,21 @@ handler_cb (void * /*cls*/,
}
inc_metric("http_requests_total", "type", artifacttype);
- r = handle_buildid(buildid, artifacttype, suffix, 0); // NB: don't care about result-fd
+ // get the resulting fd so we can report its size
+ int fd;
+ r = handle_buildid(connection, buildid, artifacttype, suffix, &fd);
+ if (r)
+ {
+ struct stat fs;
+ if (fstat(fd, &fs) == 0)
+ http_size = fs.st_size;
+ // libmicrohttpd will close (fd);
+ }
}
else if (url1 == "/metrics")
{
inc_metric("http_requests_total", "type", "metrics");
- r = handle_metrics();
+ r = handle_metrics(& http_size);
}
else
throw reportable_exception("webapi error, unrecognized /operation");
@@ -1244,16 +1795,39 @@ handler_cb (void * /*cls*/,
if (r == 0)
throw reportable_exception("internal error, missing response");
- int rc = MHD_queue_response (connection, MHD_HTTP_OK, r);
+ rc = MHD_queue_response (connection, MHD_HTTP_OK, r);
+ http_code = MHD_HTTP_OK;
MHD_destroy_response (r);
- return rc;
}
catch (const reportable_exception& e)
{
inc_metric("http_responses_total","result","error");
e.report(clog);
- return e.mhd_send_response (connection);
+ http_code = e.code;
+ http_size = e.message.size();
+ rc = e.mhd_send_response (connection);
}
+
+ gettimeofday (&tv_end, NULL);
+ double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
+ obatched(clog) << conninfo(connection)
+ << ' ' << method << ' ' << url
+ << ' ' << http_code << ' ' << http_size
+ << ' ' << (int)(deltas*1000) << "ms"
+ << endl;
+
+ // related prometheus metrics
+ string http_code_str = to_string(http_code);
+ if (http_size >= 0)
+ add_metric("http_responses_transfer_bytes_sum","code",http_code_str,
+ http_size);
+ inc_metric("http_responses_transfer_bytes_count","code",http_code_str);
+
+ add_metric("http_responses_duration_milliseconds_sum","code",http_code_str,
+ deltas*1000); // prometheus prefers _seconds and floating point
+ inc_metric("http_responses_duration_milliseconds_count","code",http_code_str);
+
+ return rc;
}
@@ -1300,7 +1874,7 @@ dwarf_extract_source_paths (Elf *elf, set<string>& debug_sourcefiles)
struct MHD_Response *r = 0;
try
{
- r = handle_buildid (buildid, "debuginfo", "", &alt_fd);
+ r = handle_buildid (0, buildid, "debuginfo", "", &alt_fd);
}
catch (const reportable_exception& e)
{
@@ -1399,7 +1973,8 @@ dwarf_extract_source_paths (Elf *elf, set<string>& debug_sourcefiles)
waldo = (string (comp_dir) + string("/") + string (hat));
else
{
- obatched(clog) << "skipping hat=" << hat << " due to empty comp_dir" << endl;
+ if (verbose > 3)
+ obatched(clog) << "skipping hat=" << hat << " due to empty comp_dir" << endl;
continue;
}
@@ -1544,334 +2119,222 @@ elf_classify (int fd, bool &executable_p, bool &debuginfo_p, string &buildid, se
}
-static semaphore* scan_concurrency_sem = 0; // used to implement -c load limiting
-
-
static void
-scan_source_file_path (const string& dir)
+scan_source_file (const string& rps, const stat_t& st,
+ sqlite_ps& ps_upsert_buildids,
+ sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de,
+ sqlite_ps& ps_upsert_s,
+ sqlite_ps& ps_query,
+ sqlite_ps& ps_scan_done,
+ unsigned& fts_cached,
+ unsigned& fts_executable,
+ unsigned& fts_debuginfo,
+ unsigned& fts_sourcefiles)
{
- obatched(clog) << "fts/file traversing " << dir << endl;
+ /* See if we know of it already. */
+ int rc = ps_query
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .step();
+ ps_query.reset();
+ if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
+ // no need to recheck a file/version we already know
+ // specifically, no need to elf-begin a file we already determined is non-elf
+  // (so it is stored with buildid=NULL)
+ {
+ fts_cached++;
+ return;
+ }
- struct timeval tv_start, tv_end;
- gettimeofday (&tv_start, NULL);
-
- sqlite_ps ps_upsert_buildids (db, "file-buildids-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
- sqlite_ps ps_upsert_files (db, "file-files-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
- sqlite_ps ps_upsert_de (db, "file-de-upsert",
- "insert or ignore into " BUILDIDS "_f_de "
- "(buildid, debuginfo_p, executable_p, file, mtime) "
- "values ((select id from " BUILDIDS "_buildids where hex = ?),"
- " ?,?,"
- " (select id from " BUILDIDS "_files where name = ?), ?);");
- sqlite_ps ps_upsert_s (db, "file-s-upsert",
- "insert or ignore into " BUILDIDS "_f_s "
- "(buildid, artifactsrc, file, mtime) "
- "values ((select id from " BUILDIDS "_buildids where hex = ?),"
- " (select id from " BUILDIDS "_files where name = ?),"
- " (select id from " BUILDIDS "_files where name = ?),"
- " ?);");
- sqlite_ps ps_query (db, "file-negativehit-find",
- "select 1 from " BUILDIDS "_file_mtime_scanned where sourcetype = 'F' and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
- sqlite_ps ps_scan_done (db, "file-scanned",
- "insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
- "values ('F', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
+ bool executable_p = false, debuginfo_p = false; // E and/or D
+ string buildid;
+ set<string> sourcefiles;
+ int fd = open (rps.c_str(), O_RDONLY);
+ try
+ {
+ if (fd >= 0)
+ elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles);
+ else
+ throw libc_exception(errno, string("open ") + rps);
+ inc_metric ("scanned_total","source","file");
+ }
+ // NB: we catch exceptions here too, so that we can
+ // cache the corrupt-elf case (!executable_p &&
+ // !debuginfo_p) just below, just as if we had an
+ // EPERM error from open(2).
+ catch (const reportable_exception& e)
+ {
+ e.report(clog);
+ }
- char * const dirs[] = { (char*) dir.c_str(), NULL };
+ if (fd >= 0)
+ close (fd);
- unsigned fts_scanned=0, fts_regex=0, fts_cached=0, fts_debuginfo=0, fts_executable=0, fts_sourcefiles=0;
+ // register this file name in the interning table
+ ps_upsert_files
+ .reset()
+ .bind(1, rps)
+ .step_ok_done();
- FTS *fts = fts_open (dirs,
- (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
- | FTS_NOCHDIR /* multithreaded */,
- NULL);
- if (fts == NULL)
+ if (buildid == "")
{
- obatched(cerr) << "cannot fts_open " << dir << endl;
- return;
+ // no point storing an elf file without buildid
+ executable_p = false;
+ debuginfo_p = false;
}
-
- FTSENT *f;
- while ((f = fts_read (fts)) != NULL)
+ else
{
- semaphore_borrower handle_one_file (scan_concurrency_sem);
+ // register this build-id in the interning table
+ ps_upsert_buildids
+ .reset()
+ .bind(1, buildid)
+ .step_ok_done();
+ }
- fts_scanned ++;
- if (interrupted)
- break;
+ if (executable_p)
+ fts_executable ++;
+ if (debuginfo_p)
+ fts_debuginfo ++;
+ if (executable_p || debuginfo_p)
+ {
+ ps_upsert_de
+ .reset()
+ .bind(1, buildid)
+ .bind(2, debuginfo_p ? 1 : 0)
+ .bind(3, executable_p ? 1 : 0)
+ .bind(4, rps)
+ .bind(5, st.st_mtime)
+ .step_ok_done();
+ }
+ if (executable_p)
+ inc_metric("found_executable_total","source","files");
+ if (debuginfo_p)
+ inc_metric("found_debuginfo_total","source","files");
- if (verbose > 2)
- obatched(clog) << "fts/file traversing " << f->fts_path << endl;
+ if (sourcefiles.size() && buildid != "")
+ {
+ fts_sourcefiles += sourcefiles.size();
- try
+ for (auto&& dwarfsrc : sourcefiles)
{
- /* Found a file. Convert it to an absolute path, so
- the buildid database does not have relative path
- names that are unresolvable from a subsequent run
- in a different cwd. */
- char *rp = realpath(f->fts_path, NULL);
- if (rp == NULL)
- continue; // ignore dangling symlink or such
- string rps = string(rp);
- free (rp);
-
- bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
- bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
- if (!ri || rx)
- {
- if (verbose > 3)
- obatched(clog) << "fts/file skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
- fts_regex ++;
- continue;
- }
-
- switch (f->fts_info)
- {
- case FTS_D:
- break;
-
- case FTS_DP:
- break;
-
- case FTS_F:
- {
- /* See if we know of it already. */
- int rc = ps_query
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .step();
- ps_query.reset();
- if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
- // no need to recheck a file/version we already know
- // specifically, no need to elf-begin a file we already determined is non-elf
- // (so is stored with buildid=NULL)
- {
- fts_cached ++;
- continue;
- }
-
- bool executable_p = false, debuginfo_p = false; // E and/or D
- string buildid;
- set<string> sourcefiles;
-
- int fd = open (rps.c_str(), O_RDONLY);
- try
- {
- if (fd >= 0)
- elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles);
- else
- throw libc_exception(errno, string("open ") + rps);
- inc_metric ("scanned_total","source","file");
- }
-
- // NB: we catch exceptions here too, so that we can
- // cache the corrupt-elf case (!executable_p &&
- // !debuginfo_p) just below, just as if we had an
- // EPERM error from open(2).
-
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
-
- if (fd >= 0)
- close (fd);
-
- // register this file name in the interning table
- ps_upsert_files
- .reset()
- .bind(1, rps)
- .step_ok_done();
-
- if (buildid == "")
- {
- // no point storing an elf file without buildid
- executable_p = false;
- debuginfo_p = false;
- }
- else
- {
- // register this build-id in the interning table
- ps_upsert_buildids
- .reset()
- .bind(1, buildid)
- .step_ok_done();
- }
-
- if (executable_p)
- fts_executable ++;
- if (debuginfo_p)
- fts_debuginfo ++;
- if (executable_p || debuginfo_p)
- {
- ps_upsert_de
- .reset()
- .bind(1, buildid)
- .bind(2, debuginfo_p ? 1 : 0)
- .bind(3, executable_p ? 1 : 0)
- .bind(4, rps)
- .bind(5, f->fts_statp->st_mtime)
- .step_ok_done();
- }
- if (executable_p)
- inc_metric("found_executable_total","source","files");
- if (debuginfo_p)
- inc_metric("found_debuginfo_total","source","files");
-
- if (sourcefiles.size() && buildid != "")
- {
- fts_sourcefiles += sourcefiles.size();
-
- for (auto&& dwarfsrc : sourcefiles)
- {
- char *srp = realpath(dwarfsrc.c_str(), NULL);
- if (srp == NULL) // also if DWZ unresolved dwarfsrc=""
- continue; // unresolvable files are not a serious problem
- // throw libc_exception(errno, "fts/file realpath " + srcpath);
- string srps = string(srp);
- free (srp);
-
- struct stat sfs;
- rc = stat(srps.c_str(), &sfs);
- if (rc != 0)
- continue;
-
- if (verbose > 2)
- obatched(clog) << "recorded buildid=" << buildid << " file=" << srps
- << " mtime=" << sfs.st_mtime
- << " as source " << dwarfsrc << endl;
-
- ps_upsert_files
- .reset()
- .bind(1, srps)
- .step_ok_done();
-
- // register the dwarfsrc name in the interning table too
- ps_upsert_files
- .reset()
- .bind(1, dwarfsrc)
- .step_ok_done();
-
- ps_upsert_s
- .reset()
- .bind(1, buildid)
- .bind(2, dwarfsrc)
- .bind(3, srps)
- .bind(4, sfs.st_mtime)
- .step_ok_done();
-
- inc_metric("found_sourcerefs_total","source","files");
- }
- }
-
- ps_scan_done
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .bind(3, f->fts_statp->st_size)
- .step_ok_done();
-
- if (verbose > 2)
- obatched(clog) << "recorded buildid=" << buildid << " file=" << rps
- << " mtime=" << f->fts_statp->st_mtime << " atype="
- << (executable_p ? "E" : "")
- << (debuginfo_p ? "D" : "") << endl;
- }
- break;
+ char *srp = realpath(dwarfsrc.c_str(), NULL);
+ if (srp == NULL) // also if DWZ unresolved dwarfsrc=""
+ continue; // unresolvable files are not a serious problem
+ // throw libc_exception(errno, "fts/file realpath " + srcpath);
+ string srps = string(srp);
+ free (srp);
+
+ struct stat sfs;
+ rc = stat(srps.c_str(), &sfs);
+ if (rc != 0)
+ continue;
- case FTS_ERR:
- case FTS_NS:
- throw libc_exception(f->fts_errno, string("fts/file traversal ") + string(f->fts_path));
+ if (verbose > 2)
+ obatched(clog) << "recorded buildid=" << buildid << " file=" << srps
+ << " mtime=" << sfs.st_mtime
+ << " as source " << dwarfsrc << endl;
- default:
- case FTS_SL: /* ignore symlinks; seen in non-L mode only */
- break;
- }
+ ps_upsert_files
+ .reset()
+ .bind(1, srps)
+ .step_ok_done();
- if ((verbose && f->fts_info == FTS_DP) ||
- (verbose > 1 && f->fts_info == FTS_F))
- obatched(clog) << "fts/file traversing " << rps << ", scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable << ", source=" << fts_sourcefiles << endl;
- }
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
- }
- fts_close (fts);
+ // register the dwarfsrc name in the interning table too
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc)
+ .step_ok_done();
- gettimeofday (&tv_end, NULL);
- double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
+ ps_upsert_s
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc)
+ .bind(3, srps)
+ .bind(4, sfs.st_mtime)
+ .step_ok_done();
- obatched(clog) << "fts/file traversed " << dir << " in " << deltas << "s, scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable << ", source=" << fts_sourcefiles << endl;
-}
+ // PR25548: also store canonicalized source path
+ string dwarfsrc_canon = canon_pathname (dwarfsrc);
+ if (dwarfsrc_canon != dwarfsrc)
+ {
+ if (verbose > 3)
+ obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl;
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc_canon)
+ .step_ok_done();
-static void*
-thread_main_scan_source_file_path (void* arg)
-{
- string dir = string((const char*) arg);
+ ps_upsert_s
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc_canon)
+ .bind(3, srps)
+ .bind(4, sfs.st_mtime)
+ .step_ok_done();
+ }
- unsigned rescan_timer = 0;
- sig_atomic_t forced_rescan_count = 0;
- set_metric("thread_timer_max", "file", dir, rescan_s);
- set_metric("thread_tid", "file", dir, tid());
- while (! interrupted)
- {
- set_metric("thread_timer", "file", dir, rescan_timer);
- set_metric("thread_forced_total", "file", dir, forced_rescan_count);
- if (rescan_s && rescan_timer > rescan_s)
- rescan_timer = 0;
- if (sigusr1 != forced_rescan_count)
- {
- forced_rescan_count = sigusr1;
- rescan_timer = 0;
+ inc_metric("found_sourcerefs_total","source","files");
}
- if (rescan_timer == 0)
- try
- {
- set_metric("thread_working", "file", dir, time(NULL));
- inc_metric("thread_work_total", "file", dir);
- scan_source_file_path (dir);
- set_metric("thread_working", "file", dir, 0);
- }
- catch (const sqlite_exception& e)
- {
- obatched(cerr) << e.message << endl;
- }
- sleep (1);
- rescan_timer ++;
}
- return 0;
+ ps_scan_done
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .bind(3, st.st_size)
+ .step_ok_done();
+
+ if (verbose > 2)
+ obatched(clog) << "recorded buildid=" << buildid << " file=" << rps
+ << " mtime=" << st.st_mtime << " atype="
+ << (executable_p ? "E" : "")
+ << (debuginfo_p ? "D" : "") << endl;
}
-////////////////////////////////////////////////////////////////////////
-
-// Analyze given *.rpm file of given age; record buildids / exec/debuginfo-ness of its
+// Analyze given archive file of given age; record buildids / exec/debuginfo-ness of its
// constituent files with given upsert statements.
static void
-rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_upsert_files,
- sqlite_ps& ps_upsert_de, sqlite_ps& ps_upsert_sref, sqlite_ps& ps_upsert_sdef,
- time_t mtime,
- unsigned& fts_executable, unsigned& fts_debuginfo, unsigned& fts_sref, unsigned& fts_sdef,
- bool& fts_sref_complete_p)
+archive_classify (const string& rps, string& archive_extension,
+ sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de, sqlite_ps& ps_upsert_sref, sqlite_ps& ps_upsert_sdef,
+ time_t mtime,
+ unsigned& fts_executable, unsigned& fts_debuginfo, unsigned& fts_sref, unsigned& fts_sdef,
+ bool& fts_sref_complete_p)
{
- string popen_cmd = string("rpm2cpio " + shell_escape(rps));
- FILE* fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
- if (fp == NULL)
- throw libc_exception (errno, string("popen ") + popen_cmd);
- defer_dtor<FILE*,int> fp_closer (fp, pclose);
+ string archive_decoder = "/dev/null";
+ for (auto&& arch : scan_archives)
+ if (string_endswith(rps, arch.first))
+ {
+ archive_extension = arch.first;
+ archive_decoder = arch.second;
+ }
+
+ FILE* fp;
+ defer_dtor<FILE*,int>::dtor_fn dfn;
+ if (archive_decoder != "cat")
+ {
+ string popen_cmd = archive_decoder + " " + shell_escape(rps);
+ fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
+ dfn = pclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("popen ") + popen_cmd);
+ }
+ else
+ {
+ fp = fopen (rps.c_str(), "r");
+ dfn = fclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("fopen ") + rps);
+ }
+ defer_dtor<FILE*,int> fp_closer (fp, dfn);
struct archive *a;
a = archive_read_new();
@@ -1879,19 +2342,19 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
throw archive_exception("cannot create archive reader");
defer_dtor<struct archive*,int> archive_closer (a, archive_read_free);
- int rc = archive_read_support_format_cpio(a);
+ int rc = archive_read_support_format_all(a);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot select cpio format");
+ throw archive_exception(a, "cannot select all formats");
rc = archive_read_support_filter_all(a);
if (rc != ARCHIVE_OK)
throw archive_exception(a, "cannot select all filters");
rc = archive_read_open_FILE (a, fp);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot open archive from rpm2cpio pipe");
+ throw archive_exception(a, "cannot open archive from pipe");
if (verbose > 3)
- obatched(clog) << "rpm2cpio|libarchive scanning " << rps << endl;
+ obatched(clog) << "libarchive scanning " << rps << endl;
while(1) // parse cpio archive entries
{
@@ -1905,17 +2368,14 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
continue;
- string fn = archive_entry_pathname (e);
- if (fn.size() > 1 && fn[0] == '.')
- fn = fn.substr(1); // trim off the leading '.'
+ string fn = canonicalized_archive_entry_pathname (e);
if (verbose > 3)
- obatched(clog) << "rpm2cpio|libarchive checking " << fn << endl;
+ obatched(clog) << "libarchive checking " << fn << endl;
// extract this file to a temporary file
- const char *tmpdir_env = getenv ("TMPDIR") ?: "/tmp";
char* tmppath = NULL;
- rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir_env);
+ rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
if (rc < 0)
throw libc_exception (ENOMEM, "cannot allocate tmppath");
defer_dtor<void*,void> tmmpath_freer (tmppath, free);
@@ -1978,6 +2438,26 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
.bind(2, s)
.step_ok_done();
+ // PR25548: also store canonicalized source path
+ const string& dwarfsrc = s;
+ string dwarfsrc_canon = canon_pathname (dwarfsrc);
+ if (dwarfsrc_canon != dwarfsrc)
+ {
+ if (verbose > 3)
+ obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl;
+
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc_canon)
+ .step_ok_done();
+
+ ps_upsert_sref
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc_canon)
+ .step_ok_done();
+ }
+
fts_sref ++;
}
}
@@ -2027,250 +2507,351 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
-// scan for *.rpm files
+// scan for archive files such as .rpm
static void
-scan_source_rpm_path (const string& dir)
+scan_archive_file (const string& rps, const stat_t& st,
+ sqlite_ps& ps_upsert_buildids,
+ sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de,
+ sqlite_ps& ps_upsert_sref,
+ sqlite_ps& ps_upsert_sdef,
+ sqlite_ps& ps_query,
+ sqlite_ps& ps_scan_done,
+ unsigned& fts_cached,
+ unsigned& fts_executable,
+ unsigned& fts_debuginfo,
+ unsigned& fts_sref,
+ unsigned& fts_sdef)
+{
+ /* See if we know of it already. */
+ int rc = ps_query
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .step();
+ ps_query.reset();
+ if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
+ // no need to recheck a file/version we already know
+ // specifically, no need to parse this archive again, since we already have
+ // it as a D or E or S record,
+  // (so it is stored with buildid=NULL)
+ {
+ fts_cached ++;
+ return;
+ }
+
+ // intern the archive file name
+ ps_upsert_files
+ .reset()
+ .bind(1, rps)
+ .step_ok_done();
+
+ // extract the archive contents
+ unsigned my_fts_executable = 0, my_fts_debuginfo = 0, my_fts_sref = 0, my_fts_sdef = 0;
+ bool my_fts_sref_complete_p = true;
+ try
+ {
+ string archive_extension;
+ archive_classify (rps, archive_extension,
+ ps_upsert_buildids, ps_upsert_files,
+ ps_upsert_de, ps_upsert_sref, ps_upsert_sdef, // dalt
+ st.st_mtime,
+ my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef,
+ my_fts_sref_complete_p);
+ inc_metric ("scanned_total","source",archive_extension + " archive");
+ add_metric("found_debuginfo_total","source",archive_extension + " archive",
+ my_fts_debuginfo);
+ add_metric("found_executable_total","source",archive_extension + " archive",
+ my_fts_executable);
+ add_metric("found_sourcerefs_total","source",archive_extension + " archive",
+ my_fts_sref);
+ }
+ catch (const reportable_exception& e)
+ {
+ e.report(clog);
+ }
+
+ if (verbose > 2)
+ obatched(clog) << "scanned archive=" << rps
+ << " mtime=" << st.st_mtime
+ << " executables=" << my_fts_executable
+ << " debuginfos=" << my_fts_debuginfo
+ << " srefs=" << my_fts_sref
+ << " sdefs=" << my_fts_sdef
+ << endl;
+
+ fts_executable += my_fts_executable;
+ fts_debuginfo += my_fts_debuginfo;
+ fts_sref += my_fts_sref;
+ fts_sdef += my_fts_sdef;
+
+ if (my_fts_sref_complete_p) // leave incomplete?
+ ps_scan_done
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .bind(3, st.st_size)
+ .step_ok_done();
+}
+
+
+
+////////////////////////////////////////////////////////////////////////
+
+
+
+// The thread that consumes file names off of the scanq. We hold
+// the persistent sqlite_ps's at this level and delegate file/archive
+// scanning to other functions.
+static void*
+thread_main_scanner (void* arg)
{
- obatched(clog) << "fts/rpm traversing " << dir << endl;
+ (void) arg;
+
+ // all the prepared statements fit to use, the _f_ set:
+ sqlite_ps ps_f_upsert_buildids (db, "file-buildids-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
+ sqlite_ps ps_f_upsert_files (db, "file-files-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
+ sqlite_ps ps_f_upsert_de (db, "file-de-upsert",
+ "insert or ignore into " BUILDIDS "_f_de "
+ "(buildid, debuginfo_p, executable_p, file, mtime) "
+ "values ((select id from " BUILDIDS "_buildids where hex = ?),"
+ " ?,?,"
+ " (select id from " BUILDIDS "_files where name = ?), ?);");
+ sqlite_ps ps_f_upsert_s (db, "file-s-upsert",
+ "insert or ignore into " BUILDIDS "_f_s "
+ "(buildid, artifactsrc, file, mtime) "
+ "values ((select id from " BUILDIDS "_buildids where hex = ?),"
+ " (select id from " BUILDIDS "_files where name = ?),"
+ " (select id from " BUILDIDS "_files where name = ?),"
+ " ?);");
+ sqlite_ps ps_f_query (db, "file-negativehit-find",
+ "select 1 from " BUILDIDS "_file_mtime_scanned where sourcetype = 'F' "
+ "and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
+ sqlite_ps ps_f_scan_done (db, "file-scanned",
+ "insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
+ "values ('F', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
- sqlite_ps ps_upsert_buildids (db, "rpm-buildid-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
- sqlite_ps ps_upsert_files (db, "rpm-file-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
- sqlite_ps ps_upsert_de (db, "rpm-de-insert",
+ // and now for the _r_ set
+ sqlite_ps ps_r_upsert_buildids (db, "rpm-buildid-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
+ sqlite_ps ps_r_upsert_files (db, "rpm-file-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
+ sqlite_ps ps_r_upsert_de (db, "rpm-de-insert",
"insert or ignore into " BUILDIDS "_r_de (buildid, debuginfo_p, executable_p, file, mtime, content) values ("
"(select id from " BUILDIDS "_buildids where hex = ?), ?, ?, "
"(select id from " BUILDIDS "_files where name = ?), ?, "
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_upsert_sref (db, "rpm-sref-insert",
+ sqlite_ps ps_r_upsert_sref (db, "rpm-sref-insert",
"insert or ignore into " BUILDIDS "_r_sref (buildid, artifactsrc) values ("
"(select id from " BUILDIDS "_buildids where hex = ?), "
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_upsert_sdef (db, "rpm-sdef-insert",
+ sqlite_ps ps_r_upsert_sdef (db, "rpm-sdef-insert",
"insert or ignore into " BUILDIDS "_r_sdef (file, mtime, content) values ("
"(select id from " BUILDIDS "_files where name = ?), ?,"
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_query (db, "rpm-negativehit-query",
+ sqlite_ps ps_r_query (db, "rpm-negativehit-query",
"select 1 from " BUILDIDS "_file_mtime_scanned where "
"sourcetype = 'R' and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
- sqlite_ps ps_scan_done (db, "rpm-scanned",
+ sqlite_ps ps_r_scan_done (db, "rpm-scanned",
"insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
"values ('R', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
- char * const dirs[] = { (char*) dir.c_str(), NULL };
- struct timeval tv_start, tv_end;
- gettimeofday (&tv_start, NULL);
- unsigned fts_scanned=0, fts_regex=0, fts_cached=0, fts_debuginfo=0;
- unsigned fts_executable=0, fts_rpm = 0, fts_sref=0, fts_sdef=0;
+ unsigned fts_cached = 0, fts_executable = 0, fts_debuginfo = 0, fts_sourcefiles = 0;
+ unsigned fts_sref = 0, fts_sdef = 0;
- FTS *fts = fts_open (dirs,
- (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
- | FTS_NOCHDIR /* multithreaded */,
- NULL);
- if (fts == NULL)
- {
- obatched(cerr) << "cannot fts_open " << dir << endl;
- return;
- }
-
- FTSENT *f;
- while ((f = fts_read (fts)) != NULL)
+ add_metric("thread_count", "role", "scan", 1);
+ add_metric("thread_busy", "role", "scan", 1);
+ while (! interrupted)
{
- semaphore_borrower handle_one_file (scan_concurrency_sem);
-
- fts_scanned ++;
- if (interrupted)
- break;
+ scan_payload p;
- if (verbose > 2)
- obatched(clog) << "fts/rpm traversing " << f->fts_path << endl;
+ add_metric("thread_busy", "role", "scan", -1);
+ bool gotone = scanq.wait_front(p);
+ add_metric("thread_busy", "role", "scan", 1);
+ if (! gotone) continue; // or break
try
{
- /* Found a file. Convert it to an absolute path, so
- the buildid database does not have relative path
- names that are unresolvable from a subsequent run
- in a different cwd. */
- char *rp = realpath(f->fts_path, NULL);
- if (rp == NULL)
- continue; // ignore dangling symlink or such
- string rps = string(rp);
- free (rp);
-
- bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
- bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
- if (!ri || rx)
- {
- if (verbose > 3)
- obatched(clog) << "fts/rpm skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
- fts_regex ++;
- continue;
- }
+ bool scan_archive = false;
+ for (auto&& arch : scan_archives)
+ if (string_endswith(p.first, arch.first))
+ scan_archive = true;
+
+ if (scan_archive)
+ scan_archive_file (p.first, p.second,
+ ps_r_upsert_buildids,
+ ps_r_upsert_files,
+ ps_r_upsert_de,
+ ps_r_upsert_sref,
+ ps_r_upsert_sdef,
+ ps_r_query,
+ ps_r_scan_done,
+ fts_cached,
+ fts_executable,
+ fts_debuginfo,
+ fts_sref,
+ fts_sdef);
+
+ if (scan_files) // NB: maybe "else if" ?
+ scan_source_file (p.first, p.second,
+ ps_f_upsert_buildids,
+ ps_f_upsert_files,
+ ps_f_upsert_de,
+ ps_f_upsert_s,
+ ps_f_query,
+ ps_f_scan_done,
+ fts_cached, fts_executable, fts_debuginfo, fts_sourcefiles);
+ }
+ catch (const reportable_exception& e)
+ {
+ e.report(cerr);
+ }
- switch (f->fts_info)
- {
- case FTS_D:
- break;
+ inc_metric("thread_work_total", "role","scan");
+ }
- case FTS_DP:
- break;
+ add_metric("thread_busy", "role", "scan", -1);
+ return 0;
+}
- case FTS_F:
- {
- // heuristic: reject if file name does not end with ".rpm"
- // (alternative: try opening with librpm etc., caching)
- string suffix = ".rpm";
- if (rps.size() < suffix.size() ||
- rps.substr(rps.size()-suffix.size()) != suffix)
- continue;
- fts_rpm ++;
-
- /* See if we know of it already. */
- int rc = ps_query
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .step();
- ps_query.reset();
- if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
- // no need to recheck a file/version we already know
- // specifically, no need to parse this rpm again, since we already have
- // it as a D or E or S record,
- // (so is stored with buildid=NULL)
- {
- fts_cached ++;
- continue;
- }
-
- // intern the rpm file name
- ps_upsert_files
- .reset()
- .bind(1, rps)
- .step_ok_done();
-
- // extract the rpm contents via popen("rpm2cpio") | libarchive | loop-of-elf_classify()
- unsigned my_fts_executable = 0, my_fts_debuginfo = 0, my_fts_sref = 0, my_fts_sdef = 0;
- bool my_fts_sref_complete_p = true;
- try
- {
- rpm_classify (rps,
- ps_upsert_buildids, ps_upsert_files,
- ps_upsert_de, ps_upsert_sref, ps_upsert_sdef, // dalt
- f->fts_statp->st_mtime,
- my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef,
- my_fts_sref_complete_p);
- inc_metric ("scanned_total","source","rpm");
- add_metric("found_debuginfo_total","source","rpm",
- my_fts_debuginfo);
- add_metric("found_executable_total","source","rpm",
- my_fts_executable);
- add_metric("found_sourcerefs_total","source","rpm",
- my_fts_sref);
- }
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
-
- if (verbose > 2)
- obatched(clog) << "scanned rpm=" << rps
- << " mtime=" << f->fts_statp->st_mtime
- << " executables=" << my_fts_executable
- << " debuginfos=" << my_fts_debuginfo
- << " srefs=" << my_fts_sref
- << " sdefs=" << my_fts_sdef
- << endl;
-
- fts_executable += my_fts_executable;
- fts_debuginfo += my_fts_debuginfo;
- fts_sref += my_fts_sref;
- fts_sdef += my_fts_sdef;
-
- if (my_fts_sref_complete_p) // leave incomplete?
- ps_scan_done
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .bind(3, f->fts_statp->st_size)
- .step_ok_done();
- }
- break;
- case FTS_ERR:
- case FTS_NS:
- throw libc_exception(f->fts_errno, string("fts/rpm traversal ") + string(f->fts_path));
- default:
- case FTS_SL: /* ignore symlinks; seen in non-L mode only */
- break;
- }
+// The thread that traverses all the source_paths and enqueues all the
+// matching files into the file/archive scan queue.
+static void
+scan_source_paths()
+{
+ // NB: fedora 31 glibc/fts(3) crashes inside fts_read() on empty
+ // path list.
+ if (source_paths.empty())
+ return;
- if ((verbose && f->fts_info == FTS_DP) ||
- (verbose > 1 && f->fts_info == FTS_F))
- obatched(clog) << "fts/rpm traversing " << rps << ", scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", rpm=" << fts_rpm << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable
- << ", sourcerefs=" << fts_sref << ", sourcedefs=" << fts_sdef << endl;
- }
- catch (const reportable_exception& e)
+ // Turn the source_paths into an fts(3)-compatible char**. Since
+ // source_paths[] does not change after argv processing, the
+  // c_str()'s are safe to keep around awhile.
+ vector<const char *> sps;
+ for (auto&& sp: source_paths)
+ sps.push_back(sp.c_str());
+ sps.push_back(NULL);
+
+ FTS *fts = fts_open ((char * const *)sps.data(),
+ (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
+ | FTS_NOCHDIR /* multithreaded */,
+ NULL);
+ if (fts == NULL)
+ throw libc_exception(errno, "cannot fts_open");
+ defer_dtor<FTS*,int> fts_cleanup (fts, fts_close);
+
+ struct timeval tv_start, tv_end;
+ gettimeofday (&tv_start, NULL);
+ unsigned fts_scanned = 0, fts_regex = 0;
+
+ FTSENT *f;
+ while ((f = fts_read (fts)) != NULL)
+ {
+ if (interrupted) break;
+
+ fts_scanned ++;
+
+ if (verbose > 2)
+ obatched(clog) << "fts traversing " << f->fts_path << endl;
+
+ /* Found a file. Convert it to an absolute path, so
+ the buildid database does not have relative path
+ names that are unresolvable from a subsequent run
+ in a different cwd. */
+ char *rp = realpath(f->fts_path, NULL);
+ if (rp == NULL)
+ continue; // ignore dangling symlink or such
+ string rps = string(rp);
+ free (rp);
+
+ bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
+ bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
+ if (!ri || rx)
+ {
+ if (verbose > 3)
+ obatched(clog) << "fts skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
+ fts_regex ++;
+ continue;
+ }
+
+ switch (f->fts_info)
+ {
+ case FTS_F:
+ scanq.push_back (make_pair(rps, *f->fts_statp));
+ break;
+
+ case FTS_ERR:
+ case FTS_NS:
+ // report on some types of errors because they may reflect fixable misconfiguration
{
- e.report(clog);
+ auto x = libc_exception(f->fts_errno, string("fts traversal ") + string(f->fts_path));
+ x.report(cerr);
}
- }
- fts_close (fts);
+ break;
+ default:
+ ;
+ /* ignore */
+ }
+ }
gettimeofday (&tv_end, NULL);
double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
- obatched(clog) << "fts/rpm traversed " << dir << " in " << deltas << "s, scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", rpm=" << fts_rpm << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable
- << ", sourcerefs=" << fts_sref << ", sourcedefs=" << fts_sdef << endl;
+ obatched(clog) << "fts traversed source paths in " << deltas << "s, scanned=" << fts_scanned
+ << ", regex-skipped=" << fts_regex << endl;
}
-
static void*
-thread_main_scan_source_rpm_path (void* arg)
+thread_main_fts_source_paths (void* arg)
{
- string dir = string((const char*) arg);
+ (void) arg; // ignore; we operate on global data
- unsigned rescan_timer = 0;
sig_atomic_t forced_rescan_count = 0;
- set_metric("thread_timer_max", "rpm", dir, rescan_s);
- set_metric("thread_tid", "rpm", dir, tid());
+ set_metric("thread_tid", "role","traverse", tid());
+ add_metric("thread_count", "role", "traverse", 1);
+
+ time_t last_rescan = 0;
+
while (! interrupted)
{
- set_metric("thread_timer", "rpm", dir, rescan_timer);
- set_metric("thread_forced_total", "rpm", dir, forced_rescan_count);
- if (rescan_s && rescan_timer > rescan_s)
- rescan_timer = 0;
+ sleep (1);
+ scanq.wait_idle(); // don't start a new traversal while scanners haven't finished the job
+ scanq.done_idle(); // release the hounds
+ if (interrupted) break;
+
+ time_t now = time(NULL);
+ bool rescan_now = false;
+ if (last_rescan == 0) // at least one initial rescan is documented even for -t0
+ rescan_now = true;
+ if (rescan_s > 0 && (long)now > (long)(last_rescan + rescan_s))
+ rescan_now = true;
if (sigusr1 != forced_rescan_count)
{
forced_rescan_count = sigusr1;
- rescan_timer = 0;
+ rescan_now = true;
}
- if (rescan_timer == 0)
+ if (rescan_now)
try
{
- set_metric("thread_working", "rpm", dir, time(NULL));
- inc_metric("thread_work_total", "rpm", dir);
- scan_source_rpm_path (dir);
- set_metric("thread_working", "rpm", dir, 0);
+ set_metric("thread_busy", "role","traverse", 1);
+ scan_source_paths();
+ last_rescan = time(NULL); // NB: now was before scanning
+ inc_metric("thread_work_total", "role","traverse");
+ set_metric("thread_busy", "role","traverse", 0);
}
- catch (const sqlite_exception& e)
+ catch (const reportable_exception& e)
{
- obatched(cerr) << e.message << endl;
+ e.report(cerr);
}
- sleep (1);
- rescan_timer ++;
}
return 0;
}
+
////////////////////////////////////////////////////////////////////////
static void
@@ -2359,6 +2940,9 @@ void groom()
sqlite3_db_release_memory(db); // shrink the process if possible
+ fdcache.limit(0,0); // release the fdcache contents
+ fdcache.limit(fdcache_fds,fdcache_mbs); // restore status quo parameters
+
gettimeofday (&tv_end, NULL);
double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
@@ -2369,35 +2953,44 @@ void groom()
static void*
thread_main_groom (void* /*arg*/)
{
- unsigned groom_timer = 0;
sig_atomic_t forced_groom_count = 0;
- set_metric("thread_timer_max", "role", "groom", groom_s);
set_metric("thread_tid", "role", "groom", tid());
- while (! interrupted)
+ add_metric("thread_count", "role", "groom", 1);
+
+ time_t last_groom = 0;
+
+ while (1)
{
- set_metric("thread_timer", "role", "groom", groom_timer);
- set_metric("thread_forced_total", "role", "groom", forced_groom_count);
- if (groom_s && groom_timer > groom_s)
- groom_timer = 0;
+ sleep (1);
+ scanq.wait_idle(); // PR25394: block scanners during grooming!
+ if (interrupted) break;
+
+ time_t now = time(NULL);
+ bool groom_now = false;
+ if (last_groom == 0) // at least one initial groom is documented even for -g0
+ groom_now = true;
+ if (groom_s > 0 && (long)now > (long)(last_groom + groom_s))
+ groom_now = true;
if (sigusr2 != forced_groom_count)
{
forced_groom_count = sigusr2;
- groom_timer = 0;
+ groom_now = true;
}
- if (groom_timer == 0)
+ if (groom_now)
try
{
- set_metric("thread_working", "role", "groom", time(NULL));
- inc_metric("thread_work_total", "role", "groom");
+ set_metric("thread_busy", "role", "groom", 1);
groom ();
- set_metric("thread_working", "role", "groom", 0);
+ last_groom = time(NULL); // NB: now was before grooming
+ inc_metric("thread_work_total", "role", "groom");
+ set_metric("thread_busy", "role", "groom", 0);
}
catch (const sqlite_exception& e)
{
obatched(cerr) << e.message << endl;
}
- sleep (1);
- groom_timer ++;
+
+ scanq.done_idle();
}
return 0;
@@ -2473,6 +3066,8 @@ main (int argc, char *argv[])
/* Tell the library which version we are expecting. */
elf_version (EV_CURRENT);
+ tmpdir = string(getenv("TMPDIR") ?: "/tmp");
+
/* Set computed default values. */
db_path = string(getenv("HOME") ?: "/") + string("/.debuginfod.sqlite"); /* XDG? */
int rc = regcomp (& file_include_regex, ".*", REG_EXTENDED|REG_NOSUB); // match everything
@@ -2482,6 +3077,16 @@ main (int argc, char *argv[])
if (rc != 0)
error (EXIT_FAILURE, 0, "regcomp failure: %d", rc);
+ // default parameters for fdcache are computed from system stats
+ struct statfs sfs;
+ rc = statfs(tmpdir.c_str(), &sfs);
+ if (rc < 0)
+ fdcache_mbs = 1024; // 1 gigabyte
+ else
+ fdcache_mbs = sfs.f_bavail * sfs.f_bsize / 1024 / 1024 / 4; // 25% of free space
+ fdcache_prefetch = 64; // guesstimate storage is this much less costly than re-decompression
+ fdcache_fds = (concurrency + fdcache_prefetch) * 2;
+
/* Parse and process arguments. */
int remaining;
argp_program_version_hook = print_version; // this works
@@ -2490,8 +3095,10 @@ main (int argc, char *argv[])
error (EXIT_FAILURE, 0,
"unexpected argument: %s", argv[remaining]);
- if (!scan_rpms && !scan_files && source_paths.size()>0)
- obatched(clog) << "warning: without -F and/or -R, ignoring PATHs" << endl;
+ if (scan_archives.size()==0 && !scan_files && source_paths.size()>0)
+ obatched(clog) << "warning: without -F -R -U -Z, ignoring PATHs" << endl;
+
+ fdcache.limit(fdcache_fds, fdcache_mbs);
(void) signal (SIGPIPE, SIG_IGN); // microhttpd can generate it incidentally, ignore
(void) signal (SIGINT, signal_handler); // ^C
@@ -2500,9 +3107,6 @@ main (int argc, char *argv[])
(void) signal (SIGUSR1, sigusr1_handler); // end-user
(void) signal (SIGUSR2, sigusr2_handler); // end-user
- // do this before any threads start
- scan_concurrency_sem = new semaphore(concurrency);
-
/* Get database ready. */
rc = sqlite3_open_v2 (db_path.c_str(), &db, (SQLITE_OPEN_READWRITE
|SQLITE_OPEN_CREATE
@@ -2611,64 +3215,68 @@ main (int argc, char *argv[])
if (maxigroom)
obatched(clog) << "maxigroomed database" << endl;
-
obatched(clog) << "search concurrency " << concurrency << endl;
obatched(clog) << "rescan time " << rescan_s << endl;
+ obatched(clog) << "fdcache fds " << fdcache_fds << endl;
+ obatched(clog) << "fdcache mbs " << fdcache_mbs << endl;
+ obatched(clog) << "fdcache prefetch " << fdcache_prefetch << endl;
+ obatched(clog) << "fdcache tmpdir " << tmpdir << endl;
obatched(clog) << "groom time " << groom_s << endl;
+ if (scan_archives.size()>0)
+ {
+ obatched ob(clog);
+ auto& o = ob << "scanning archive types ";
+ for (auto&& arch : scan_archives)
+ o << arch.first << "(" << arch.second << ") ";
+ o << endl;
+ }
const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
if (du && du[0] != '\0') // set to non-empty string?
obatched(clog) << "upstream debuginfod servers: " << du << endl;
- vector<pthread_t> source_file_scanner_threads;
- vector<pthread_t> source_rpm_scanner_threads;
- pthread_t groom_thread;
+ vector<pthread_t> all_threads;
- rc = pthread_create (& groom_thread, NULL, thread_main_groom, NULL);
+ pthread_t pt;
+ rc = pthread_create (& pt, NULL, thread_main_groom, NULL);
if (rc < 0)
error (0, 0, "warning: cannot spawn thread (%d) to groom database\n", rc);
-
- if (scan_files) for (auto&& it : source_paths)
- {
- pthread_t pt;
- rc = pthread_create (& pt, NULL, thread_main_scan_source_file_path, (void*) it.c_str());
- if (rc < 0)
- error (0, 0, "warning: cannot spawn thread (%d) to scan source files %s\n", rc, it.c_str());
- else
- source_file_scanner_threads.push_back(pt);
- }
+ else
+ all_threads.push_back(pt);
- if (scan_rpms) for (auto&& it : source_paths)
+ if (scan_files || scan_archives.size() > 0)
{
- pthread_t pt;
- rc = pthread_create (& pt, NULL, thread_main_scan_source_rpm_path, (void*) it.c_str());
+ rc = pthread_create (& pt, NULL, thread_main_fts_source_paths, NULL);
if (rc < 0)
- error (0, 0, "warning: cannot spawn thread (%d) to scan source rpms %s\n", rc, it.c_str());
- else
- source_rpm_scanner_threads.push_back(pt);
+ error (0, 0, "warning: cannot spawn thread (%d) to traverse source paths\n", rc);
+ all_threads.push_back(pt);
+ for (unsigned i=0; i<concurrency; i++)
+ {
+ rc = pthread_create (& pt, NULL, thread_main_scanner, NULL);
+ if (rc < 0)
+ error (0, 0, "warning: cannot spawn thread (%d) to scan source files / archives\n", rc);
+ all_threads.push_back(pt);
+ }
}
/* Trivial main loop! */
set_metric("ready", 1);
while (! interrupted)
pause ();
+ scanq.nuke(); // wake up any remaining scanq-related threads, let them die
set_metric("ready", 0);
if (verbose)
obatched(clog) << "stopping" << endl;
- /* Join any source scanning threads. */
- for (auto&& it : source_file_scanner_threads)
- pthread_join (it, NULL);
- for (auto&& it : source_rpm_scanner_threads)
+ /* Join all our threads. */
+ for (auto&& it : all_threads)
pthread_join (it, NULL);
- pthread_join (groom_thread, NULL);
-
+
/* Stop all the web service threads. */
if (d4) MHD_stop_daemon (d4);
if (d6) MHD_stop_daemon (d6);
/* With all threads known dead, we can clean up the global resources. */
- delete scan_concurrency_sem;
rc = sqlite3_exec (db, DEBUGINFOD_SQLITE_CLEANUP_DDL, NULL, NULL, NULL);
if (rc != SQLITE_OK)
{
diff --git a/debuginfod/debuginfod.h b/debuginfod/debuginfod.h
index 6b1b1cc3..8d90838b 100644
--- a/debuginfod/debuginfod.h
+++ b/debuginfod/debuginfod.h
@@ -1,5 +1,5 @@
/* External declarations for the libdebuginfod client library.
- Copyright (C) 2019 Red Hat, Inc.
+ Copyright (C) 2019-2020 Red Hat, Inc.
This file is part of elfutils.
This file is free software; you can redistribute it and/or modify
@@ -33,6 +33,7 @@
#define DEBUGINFOD_URLS_ENV_VAR "DEBUGINFOD_URLS"
#define DEBUGINFOD_CACHE_PATH_ENV_VAR "DEBUGINFOD_CACHE_PATH"
#define DEBUGINFOD_TIMEOUT_ENV_VAR "DEBUGINFOD_TIMEOUT"
+#define DEBUGINFOD_PROGRESS_ENV_VAR "DEBUGINFOD_PROGRESS"
/* Handle for debuginfod-client connection. */
typedef struct debuginfod_client debuginfod_client;
@@ -47,13 +48,13 @@ debuginfod_client *debuginfod_begin (void);
/* Query the urls contained in $DEBUGINFOD_URLS for a file with
the specified type and build id. If build_id_len == 0, the
build_id is supplied as a lowercase hexadecimal string; otherwise
- it is a binary blob of given legnth.
+ it is a binary blob of given length.
If successful, return a file descriptor to the target, otherwise
return a posix error code. If successful, set *path to a
strdup'd copy of the name of the same file in the cache.
Caller must free() it later. */
-
+
int debuginfod_find_debuginfo (debuginfod_client *client,
const unsigned char *build_id,
int build_id_len,
@@ -74,6 +75,18 @@ typedef int (*debuginfod_progressfn_t)(debuginfod_client *c, long a, long b);
void debuginfod_set_progressfn(debuginfod_client *c,
debuginfod_progressfn_t fn);
+/* Set the user parameter. */
+void debuginfod_set_user_data (debuginfod_client *client, void *value);
+
+/* Get the user parameter. */
+void* debuginfod_get_user_data (debuginfod_client *client);
+
+/* Get the current or last active URL, if known. */
+const char* debuginfod_get_url (debuginfod_client *client);
+
+/* Add an outgoing HTTP request "Header: Value". Copies string. */
+int debuginfod_add_http_header (debuginfod_client *client, const char* header);
+
/* Release debuginfod client connection context handle. */
void debuginfod_end (debuginfod_client *client);
diff --git a/debuginfod/libdebuginfod.map b/debuginfod/libdebuginfod.map
index 0d26f93e..b8edfb01 100644
--- a/debuginfod/libdebuginfod.map
+++ b/debuginfod/libdebuginfod.map
@@ -8,3 +8,10 @@ ELFUTILS_0.178 {
debuginfod_find_source;
debuginfod_set_progressfn;
} ELFUTILS_0;
+ELFUTILS_0.179 {
+ global:
+ debuginfod_set_user_data;
+ debuginfod_get_user_data;
+ debuginfod_get_url;
+ debuginfod_add_http_header;
+};