summaryrefslogtreecommitdiffstats
path: root/debuginfod/debuginfod.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'debuginfod/debuginfod.cxx')
-rw-r--r--debuginfod/debuginfod.cxx1828
1 files changed, 1218 insertions, 610 deletions
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index aa7ffcf6..76f1fa52 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -1,5 +1,5 @@
/* Debuginfo-over-http server.
- Copyright (C) 2019 Red Hat, Inc.
+ Copyright (C) 2019-2020 Red Hat, Inc.
This file is part of elfutils.
This file is free software; you can redistribute it and/or modify
@@ -52,6 +52,7 @@ extern "C" {
#include <signal.h>
#include <sys/stat.h>
#include <sys/time.h>
+#include <sys/vfs.h>
#include <unistd.h>
#include <fcntl.h>
#include <netdb.h>
@@ -79,6 +80,7 @@ extern "C" {
#include <ostream>
#include <sstream>
#include <mutex>
+#include <deque>
#include <condition_variable>
#include <thread>
// #include <regex> // on rhel7 gcc 4.8, not competent
@@ -106,6 +108,15 @@ using namespace std;
#endif
+inline bool
+string_endswith(const string& haystack, const string& needle)
+{
+ return (haystack.size() >= needle.size() &&
+ equal(haystack.end()-needle.size(), haystack.end(),
+ needle.begin()));
+}
+
+
// Roll this identifier for every sqlite schema incompatiblity.
#define BUILDIDS "buildids9"
@@ -231,9 +242,9 @@ static const char DEBUGINFOD_SQLITE_DDL[] =
"create view if not exists " BUILDIDS "_stats as\n"
" select 'file d/e' as label,count(*) as quantity from " BUILDIDS "_f_de\n"
"union all select 'file s',count(*) from " BUILDIDS "_f_s\n"
- "union all select 'rpm d/e',count(*) from " BUILDIDS "_r_de\n"
- "union all select 'rpm sref',count(*) from " BUILDIDS "_r_sref\n"
- "union all select 'rpm sdef',count(*) from " BUILDIDS "_r_sdef\n"
+ "union all select 'archive d/e',count(*) from " BUILDIDS "_r_de\n"
+ "union all select 'archive sref',count(*) from " BUILDIDS "_r_sref\n"
+ "union all select 'archive sdef',count(*) from " BUILDIDS "_r_sdef\n"
"union all select 'buildids',count(*) from " BUILDIDS "_buildids\n"
"union all select 'filenames',count(*) from " BUILDIDS "_files\n"
"union all select 'files scanned (#)',count(*) from " BUILDIDS "_file_mtime_scanned\n"
@@ -322,9 +333,11 @@ ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
static const struct argp_option options[] =
{
{ NULL, 0, NULL, 0, "Scanners:", 1 },
- { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning threads.", 0 },
- { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning threads.", 0 },
- // "source-oci-imageregistry" ...
+ { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning.", 0 },
+ { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning.", 0 },
+ { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning.", 0 },
+ { "scan-archive", 'Z', "EXT=CMD", 0, "Enable arbitrary archive scanning.", 0 },
+ // "source-oci-imageregistry" ...
{ NULL, 0, NULL, 0, "Options:", 2 },
{ "logical", 'L', NULL, 0, "Follow symlinks, default=ignore.", 0 },
@@ -338,7 +351,12 @@ static const struct argp_option options[] =
{ "database", 'd', "FILE", 0, "Path to sqlite database.", 0 },
{ "ddl", 'D', "SQL", 0, "Apply extra sqlite ddl/pragma to connection.", 0 },
{ "verbose", 'v', NULL, 0, "Increase verbosity.", 0 },
-
+#define ARGP_KEY_FDCACHE_FDS 0x1001
+ { "fdcache-fds", ARGP_KEY_FDCACHE_FDS, "NUM", 0, "Maximum number of archive files to keep in fdcache.", 0 },
+#define ARGP_KEY_FDCACHE_MBS 0x1002
+ { "fdcache-mbs", ARGP_KEY_FDCACHE_MBS, "MB", 0, "Maximum total size of archive file fdcache.", 0 },
+#define ARGP_KEY_FDCACHE_PREFETCH 0x1003
+ { "fdcache-prefetch", ARGP_KEY_FDCACHE_PREFETCH, "NUM", 0, "Number of archive files to prefetch into fdcache.", 0 },
{ NULL, 0, NULL, 0, NULL, 0 }
};
@@ -359,7 +377,7 @@ static struct argp argp =
static string db_path;
-static sqlite3 *db;
+static sqlite3 *db; // single connection, serialized across all our threads!
static unsigned verbose;
static volatile sig_atomic_t interrupted = 0;
static volatile sig_atomic_t sigusr1 = 0;
@@ -367,15 +385,19 @@ static volatile sig_atomic_t sigusr2 = 0;
static unsigned http_port = 8002;
static unsigned rescan_s = 300;
static unsigned groom_s = 86400;
-static unsigned maxigroom = false;
+static bool maxigroom = false;
static unsigned concurrency = std::thread::hardware_concurrency() ?: 1;
static set<string> source_paths;
static bool scan_files = false;
-static bool scan_rpms = false;
+static map<string,string> scan_archives;
static vector<string> extra_ddl;
static regex_t file_include_regex;
static regex_t file_exclude_regex;
static bool traverse_logical;
+static long fdcache_fds;
+static long fdcache_mbs;
+static long fdcache_prefetch;
+static string tmpdir;
static void set_metric(const string& key, int64_t value);
// static void inc_metric(const string& key);
@@ -387,6 +409,7 @@ static void inc_metric(const string& metric,
static void add_metric(const string& metric,
const string& lname, const string& lvalue,
int64_t value);
+// static void add_metric(const string& metric, int64_t value);
/* Handle program arguments. */
static error_t
@@ -399,10 +422,37 @@ parse_opt (int key, char *arg,
case 'v': verbose ++; break;
case 'd': db_path = string(arg); break;
case 'p': http_port = (unsigned) atoi(arg);
- if (http_port > 65535) argp_failure(state, 1, EINVAL, "port number");
+ if (http_port == 0 || http_port > 65535)
+ argp_failure(state, 1, EINVAL, "port number");
break;
case 'F': scan_files = true; break;
- case 'R': scan_rpms = true; break;
+ case 'R':
+ scan_archives[".rpm"]="cat"; // libarchive groks rpm natively
+ break;
+ case 'U':
+ if (access("/usr/bin/dpkg-deb", X_OK) == 0)
+ {
+ scan_archives[".deb"]="dpkg-deb --fsys-tarfile";
+ scan_archives[".ddeb"]="dpkg-deb --fsys-tarfile";
+ }
+ else
+ {
+ scan_archives[".deb"]="(bsdtar -O -x -f - data.tar.xz)<";
+ scan_archives[".ddeb"]="(bsdtar -O -x -f - data.tar.xz)<";
+ }
+ // .udeb too?
+ break;
+ case 'Z':
+ {
+ char* extension = strchr(arg, '=');
+ if (arg[0] == '\0')
+ argp_failure(state, 1, EINVAL, "missing EXT");
+ else if (extension)
+ scan_archives[string(arg, (extension-arg))]=string(extension+1);
+ else
+ scan_archives[string(arg)]=string("cat");
+ }
+ break;
case 'L':
traverse_logical = true;
break;
@@ -433,6 +483,15 @@ parse_opt (int key, char *arg,
if (rc != 0)
argp_failure(state, 1, EINVAL, "regular expession");
break;
+ case ARGP_KEY_FDCACHE_FDS:
+ fdcache_fds = atol (arg);
+ break;
+ case ARGP_KEY_FDCACHE_MBS:
+ fdcache_mbs = atol (arg);
+ break;
+ case ARGP_KEY_FDCACHE_PREFETCH:
+ fdcache_prefetch = atol (arg);
+ break;
case ARGP_KEY_ARG:
source_paths.insert(string(arg));
break;
@@ -503,39 +562,84 @@ struct elfutils_exception: public reportable_exception
////////////////////////////////////////////////////////////////////////
-// a c++ counting-semaphore class ... since we're c++11 not c++20
-
-class semaphore
+template <typename Payload>
+class workq
{
+ set<Payload> q; // eliminate duplicates
+ mutex mtx;
+ condition_variable cv;
+ bool dead;
+ unsigned idlers;
+
public:
- semaphore (unsigned c=1): count(c) {}
- inline void notify () {
+ workq() { dead = false; idlers = 0; }
+ ~workq() {}
+
+ void push_back(const Payload& p)
+ {
unique_lock<mutex> lock(mtx);
- count++;
- cv.notify_one();
+ q.insert (p);
+ set_metric("thread_work_pending","role","scan", q.size());
+ cv.notify_all();
}
- inline void wait() {
+
+ // kill this workqueue, wake up all idlers / scanners
+ void nuke() {
unique_lock<mutex> lock(mtx);
- while (count == 0)
+ // optional: q.clear();
+ dead = true;
+ cv.notify_all();
+ }
+
+ // block this scanner thread until there is work to do and no active
+ bool wait_front (Payload& p)
+ {
+ unique_lock<mutex> lock(mtx);
+ while (!dead && (q.size() == 0 || idlers > 0))
cv.wait(lock);
- count--;
+ if (dead)
+ return false;
+ else
+ {
+ p = * q.begin();
+ q.erase (q.begin());
+ set_metric("thread_work_pending","role","scan", q.size());
+ if (q.size() == 0)
+ cv.notify_all(); // maybe wake up waiting idlers
+ return true;
+ }
}
-private:
- mutex mtx;
- condition_variable cv;
- unsigned count;
-};
+ // block this idler thread until there is no work to do
+ void wait_idle ()
+ {
+ unique_lock<mutex> lock(mtx);
+ cv.notify_all(); // maybe wake up waiting scanners
+ while (!dead && (q.size() != 0))
+ cv.wait(lock);
+ idlers ++;
+ }
-class semaphore_borrower
-{
-public:
- semaphore_borrower(semaphore* s): sem(s) { sem->wait(); }
- ~semaphore_borrower() { sem->notify(); }
-private:
- semaphore* sem;
+ void done_idle ()
+ {
+ unique_lock<mutex> lock(mtx);
+ idlers --;
+ cv.notify_all(); // maybe wake up waiting scanners, but probably not (shutting down)
+ }
};
+typedef struct stat stat_t;
+typedef pair<string,stat_t> scan_payload;
+inline bool operator< (const scan_payload& a, const scan_payload& b)
+{
+ return a.first < b.first; // don't bother compare the stat fields
+}
+static workq<scan_payload> scanq; // just a single one
+// producer & idler: thread_main_fts_source_paths()
+// consumer: thread_main_scanner()
+// idler: thread_main_groom()
+
+
////////////////////////////////////////////////////////////////////////
@@ -706,7 +810,17 @@ private:
////////////////////////////////////////////////////////////////////////
-
+static string
+header_censor(const string& str)
+{
+ string y;
+ for (auto&& x : str)
+ {
+ if (isalnum(x) || x == '/' || x == '.' || x == ',' || x == '_' || x == ':')
+ y += x;
+ }
+ return y;
+}
static string
@@ -735,13 +849,21 @@ conninfo (struct MHD_Connection * conn)
hostname[0] = servname[0] = '\0';
}
- return string(hostname) + string(":") + string(servname);
+ // extract headers relevant to administration
+ const char* user_agent = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
+ const char* x_forwarded_for = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
+ // NB: these are untrustworthy, beware if machine-processing log files
+
+ return string(hostname) + string(":") + string(servname) +
+ string(" UA:") + header_censor(string(user_agent)) +
+ string(" XFF:") + header_censor(string(x_forwarded_for));
}
////////////////////////////////////////////////////////////////////////
+
static void
add_mhd_last_modified (struct MHD_Response *resp, time_t mtime)
{
@@ -833,6 +955,268 @@ shell_escape(const string& str)
}
+// PR25548: Perform POSIX / RFC3986 style path canonicalization on the input string.
+//
+// Namely:
+// // -> /
+// /foo/../ -> /
+// /./ -> /
+//
+// This mapping is done on dwarf-side source path names, which may
+// include these constructs, so we can deal with debuginfod clients
+// that accidentally canonicalize the paths.
+//
+// realpath(3) is close but not quite right, because it also resolves
+// symbolic links. Symlinks at the debuginfod server have nothing to
+// do with the build-time symlinks, thus they must not be considered.
+//
+// see also curl Curl_dedotdotify() aka RFC3986, which we mostly follow here
+// see also libc __realpath()
+// see also llvm llvm::sys::path::remove_dots()
+static string
+canon_pathname (const string& input)
+{
+ string i = input; // 5.2.4 (1)
+ string o;
+
+ while (i.size() != 0)
+ {
+ // 5.2.4 (2) A
+ if (i.substr(0,3) == "../")
+ i = i.substr(3);
+ else if(i.substr(0,2) == "./")
+ i = i.substr(2);
+
+ // 5.2.4 (2) B
+ else if (i.substr(0,3) == "/./")
+ i = i.substr(2);
+ else if (i == "/.")
+ i = ""; // no need to handle "/." complete-path-segment case; we're dealing with file names
+
+ // 5.2.4 (2) C
+ else if (i.substr(0,4) == "/../") {
+ i = i.substr(3);
+ string::size_type sl = o.rfind("/");
+ if (sl != string::npos)
+ o = o.substr(0, sl);
+ else
+ o = "";
+ } else if (i == "/..")
+ i = ""; // no need to handle "/.." complete-path-segment case; we're dealing with file names
+
+ // 5.2.4 (2) D
+ // no need to handle these cases; we're dealing with file names
+ else if (i == ".")
+ i = "";
+ else if (i == "..")
+ i = "";
+
+ // POSIX special: map // to /
+ else if (i.substr(0,2) == "//")
+ i = i.substr(1);
+
+ // 5.2.4 (2) E
+ else {
+ string::size_type next_slash = i.find("/", (i[0]=='/' ? 1 : 0)); // skip first slash
+ o += i.substr(0, next_slash);
+ if (next_slash == string::npos)
+ i = "";
+ else
+ i = i.substr(next_slash);
+ }
+ }
+
+ return o;
+}
+
+
+
+// A map-like class that owns a cache of file descriptors (indexed by
+// file / content names).
+//
+// If only it could use fd's instead of file names ... but we can't
+// dup(2) to create independent descriptors for the same unlinked
+// files, so would have to use some goofy linux /proc/self/fd/%d
+// hack such as the following
+
+#if 0
+int superdup(int fd)
+{
+#ifdef __linux__
+ char *fdpath = NULL;
+ int rc = asprintf(& fdpath, "/proc/self/fd/%d", fd);
+ int newfd;
+ if (rc >= 0)
+ newfd = open(fdpath, O_RDONLY);
+ else
+ newfd = -1;
+ free (fdpath);
+ return newfd;
+#else
+ return -1;
+#endif
+}
+#endif
+
+class libarchive_fdcache
+{
+private:
+ mutex fdcache_lock;
+
+ struct fdcache_entry
+ {
+ string archive;
+ string entry;
+ string fd;
+ double fd_size_mb; // slightly rounded up megabytes
+ };
+ deque<fdcache_entry> lru; // @head: most recently used
+ long max_fds;
+ long max_mbs;
+
+public:
+ void intern(const string& a, const string& b, string fd, off_t sz, bool front_p)
+ {
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++) // nuke preexisting copy
+ {
+ if (i->archive == a && i->entry == b)
+ {
+ unlink (i->fd.c_str());
+ lru.erase(i);
+ break; // must not continue iterating
+ }
+ }
+ double mb = (sz+65535)/1048576.0; // round up to 64K block
+ fdcache_entry n = { a, b, fd, mb };
+ if (front_p)
+ lru.push_front(n);
+ else
+ lru.push_back(n);
+ if (verbose > 3)
+ obatched(clog) << "fdcache interned a=" << a << " b=" << b
+ << " fd=" << fd << " mb=" << mb << " front=" << front_p << endl;
+ }
+
+ // NB: we age the cache at lookup time too
+ if (front_p)
+ this->limit(max_fds, max_mbs); // age cache if required
+ }
+
+ int lookup(const string& a, const string& b)
+ {
+ int fd = -1;
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ { // found it; move it to head of lru
+ fdcache_entry n = *i;
+ lru.erase(i); // invalidates i, so no more iteration!
+ lru.push_front(n);
+
+ fd = open(n.fd.c_str(), O_RDONLY); // NB: no problem if dup() fails; looks like cache miss
+ break;
+ }
+ }
+ }
+
+ if (fd >= 0)
+ this->limit(max_fds, max_mbs); // age cache if required
+
+ return fd;
+ }
+
+ int probe(const string& a, const string& b) // just a cache residency check - don't modify LRU state, don't open
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ return true;
+ }
+ return false;
+ }
+
+ void clear(const string& a, const string& b)
+ {
+ unique_lock<mutex> lock(fdcache_lock);
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ if (i->archive == a && i->entry == b)
+ { // found it; move it to head of lru
+ fdcache_entry n = *i;
+ lru.erase(i); // invalidates i, so no more iteration!
+ unlink (n.fd.c_str());
+ return;
+ }
+ }
+ }
+
+ void limit(long maxfds, long maxmbs)
+ {
+ if (verbose > 3 && (this->max_fds != maxfds || this->max_mbs != maxmbs))
+ obatched(clog) << "fdcache limited to maxfds=" << maxfds << " maxmbs=" << maxmbs << endl;
+
+ unique_lock<mutex> lock(fdcache_lock);
+ this->max_fds = maxfds;
+ this->max_mbs = maxmbs;
+
+ long total_fd = 0;
+ double total_mb = 0.0;
+ for (auto i = lru.begin(); i < lru.end(); i++)
+ {
+ // accumulate totals from most recently used one going backward
+ total_fd ++;
+ total_mb += i->fd_size_mb;
+ if (total_fd > max_fds || total_mb > max_mbs)
+ {
+ // found the cut here point!
+
+ for (auto j = i; j < lru.end(); j++) // close all the fds from here on in
+ {
+ if (verbose > 3)
+ obatched(clog) << "fdcache evicted a=" << j->archive << " b=" << j->entry
+ << " fd=" << j->fd << " mb=" << j->fd_size_mb << endl;
+ unlink (j->fd.c_str());
+ }
+
+ lru.erase(i, lru.end()); // erase the nodes generally
+ break;
+ }
+
+ }
+ }
+
+ ~libarchive_fdcache()
+ {
+ limit(0, 0);
+ }
+};
+static libarchive_fdcache fdcache;
+
+
+// For security/portability reasons, many distro-package archives have
+// a "./" in front of path names; others have nothing, others have
+// "/". Canonicalize them all to a single leading "/", with the
+// assumption that this matches the dwarf-derived file names too.
+string canonicalized_archive_entry_pathname(struct archive_entry *e)
+{
+ string fn = archive_entry_pathname(e);
+ if (fn.size() == 0)
+ return fn;
+ if (fn[0] == '/')
+ return fn;
+ if (fn[0] == '.')
+ return fn.substr(1);
+ else
+ return string("/")+fn;
+}
+
+
+
static struct MHD_Response*
handle_buildid_r_match (int64_t b_mtime,
const string& b_source0,
@@ -851,11 +1235,69 @@ handle_buildid_r_match (int64_t b_mtime,
return 0;
}
- string popen_cmd = string("rpm2cpio " + shell_escape(b_source0));
- FILE* fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
- if (fp == NULL)
- throw libc_exception (errno, string("popen ") + popen_cmd);
- defer_dtor<FILE*,int> fp_closer (fp, pclose);
+ // check for a match in the fdcache first
+ int fd = fdcache.lookup(b_source0, b_source1);
+ while (fd >= 0) // got one!; NB: this is really an if() with a possible branch out to the end
+ {
+ rc = fstat(fd, &fs);
+ if (rc < 0) // disappeared?
+ {
+ if (verbose)
+ obatched(clog) << "cannot fstat fdcache " << b_source0 << endl;
+ close(fd);
+ fdcache.clear(b_source0, b_source1);
+ break; // branch out of if "loop", to try new libarchive fetch attempt
+ }
+
+ struct MHD_Response* r = MHD_create_response_from_fd (fs.st_size, fd);
+ if (r == 0)
+ {
+ if (verbose)
+ obatched(clog) << "cannot create fd-response for " << b_source0 << endl;
+ close(fd);
+ break; // branch out of if "loop", to try new libarchive fetch attempt
+ }
+
+ inc_metric ("http_responses_total","result","archive fdcache");
+
+ MHD_add_response_header (r, "Content-Type", "application/octet-stream");
+ add_mhd_last_modified (r, fs.st_mtime);
+ if (verbose > 1)
+ obatched(clog) << "serving fdcache archive " << b_source0 << " file " << b_source1 << endl;
+ /* libmicrohttpd will close it. */
+ if (result_fd)
+ *result_fd = fd;
+ return r;
+ // NB: see, we never go around the 'loop' more than once
+ }
+
+ // no match ... grumble, must process the archive
+ string archive_decoder = "/dev/null";
+ string archive_extension = "";
+ for (auto&& arch : scan_archives)
+ if (string_endswith(b_source0, arch.first))
+ {
+ archive_extension = arch.first;
+ archive_decoder = arch.second;
+ }
+ FILE* fp;
+ defer_dtor<FILE*,int>::dtor_fn dfn;
+ if (archive_decoder != "cat")
+ {
+ string popen_cmd = archive_decoder + " " + shell_escape(b_source0);
+ fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
+ dfn = pclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("popen ") + popen_cmd);
+ }
+ else
+ {
+ fp = fopen (b_source0.c_str(), "r");
+ dfn = fclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("fopen ") + b_source0);
+ }
+ defer_dtor<FILE*,int> fp_closer (fp, dfn);
struct archive *a;
a = archive_read_new();
@@ -863,19 +1305,30 @@ handle_buildid_r_match (int64_t b_mtime,
throw archive_exception("cannot create archive reader");
defer_dtor<struct archive*,int> archive_closer (a, archive_read_free);
- rc = archive_read_support_format_cpio(a);
+ rc = archive_read_support_format_all(a);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot select cpio format");
+ throw archive_exception(a, "cannot select all format");
rc = archive_read_support_filter_all(a);
if (rc != ARCHIVE_OK)
throw archive_exception(a, "cannot select all filters");
rc = archive_read_open_FILE (a, fp);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot open archive from rpm2cpio pipe");
+ throw archive_exception(a, "cannot open archive from pipe");
- while(1) // parse cpio archive entries
+ // archive traversal is in three stages, no, four stages:
+ // 1) skip entries whose names do not match the requested one
+ // 2) extract the matching entry name (set r = result)
+ // 3) extract some number of prefetched entries (just into fdcache)
+ // 4) abort any further processing
+ struct MHD_Response* r = 0; // will set in stage 2
+ unsigned prefetch_count = fdcache_prefetch; // will decrement in stage 3
+
+ while(r == 0 || prefetch_count > 0) // stage 1, 2, or 3
{
+ if (interrupted)
+ break;
+
struct archive_entry *e;
rc = archive_read_next_header (a, &e);
if (rc != ARCHIVE_OK)
@@ -884,48 +1337,81 @@ handle_buildid_r_match (int64_t b_mtime,
if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
continue;
- string fn = archive_entry_pathname (e);
- if (fn != string(".")+b_source1)
+ string fn = canonicalized_archive_entry_pathname (e);
+ if ((r == 0) && (fn != b_source1)) // stage 1
+ continue;
+
+ if (fdcache.probe (b_source0, fn)) // skip if already interned
continue;
// extract this file to a temporary file
- char tmppath[PATH_MAX] = "/tmp/debuginfod.XXXXXX"; // XXX: $TMP_DIR etc.
- int fd = mkstemp (tmppath);
+ char* tmppath = NULL;
+ rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
+ if (rc < 0)
+ throw libc_exception (ENOMEM, "cannot allocate tmppath");
+ defer_dtor<void*,void> tmmpath_freer (tmppath, free);
+ fd = mkstemp (tmppath);
if (fd < 0)
throw libc_exception (errno, "cannot create temporary file");
- unlink (tmppath); // unlink now so OS will release the file as soon as we close the fd
+ // NB: don't unlink (tmppath), as fdcache will take charge of it.
rc = archive_read_data_into_fd (a, fd);
- if (rc != ARCHIVE_OK)
+ if (rc != ARCHIVE_OK) // e.g. ENOSPC!
{
close (fd);
+ unlink (tmppath);
throw archive_exception(a, "cannot extract file");
}
- inc_metric ("http_responses_total","result","rpm");
- struct MHD_Response* r = MHD_create_response_from_fd (archive_entry_size(e), fd);
+ // Set the mtime so the fdcache file mtimes, even prefetched ones,
+ // propagate to future webapi clients.
+ struct timeval tvs[2];
+ tvs[0].tv_sec = tvs[1].tv_sec = archive_entry_mtime(e);
+ tvs[0].tv_usec = tvs[1].tv_usec = 0;
+ (void) futimes (fd, tvs); /* best effort */
+
+ if (r != 0) // stage 3
+ {
+ // NB: now we know we have a complete reusable file; make fdcache
+ // responsible for unlinking it later.
+ fdcache.intern(b_source0, fn,
+ tmppath, archive_entry_size(e),
+ false); // prefetched ones go to back of lru
+ prefetch_count --;
+ close (fd); // we're not saving this fd to make a mhd-response from!
+ continue;
+ }
+
+ // NB: now we know we have a complete reusable file; make fdcache
+ // responsible for unlinking it later.
+ fdcache.intern(b_source0, b_source1,
+ tmppath, archive_entry_size(e),
+ true); // requested ones go to the front of lru
+
+ inc_metric ("http_responses_total","result",archive_extension + " archive");
+ r = MHD_create_response_from_fd (archive_entry_size(e), fd);
if (r == 0)
{
if (verbose)
obatched(clog) << "cannot create fd-response for " << b_source0 << endl;
close(fd);
- break; // assume no chance of better luck around another iteration
+ break; // assume no chance of better luck around another iteration; no other copies of same file
}
else
{
MHD_add_response_header (r, "Content-Type", "application/octet-stream");
add_mhd_last_modified (r, archive_entry_mtime(e));
if (verbose > 1)
- obatched(clog) << "serving rpm " << b_source0 << " file " << b_source1 << endl;
+ obatched(clog) << "serving archive " << b_source0 << " file " << b_source1 << endl;
/* libmicrohttpd will close it. */
if (result_fd)
*result_fd = fd;
- return r;
+ continue;
}
}
// XXX: rpm/file not found: delete this R entry?
- return 0;
+ return r;
}
@@ -955,11 +1441,12 @@ debuginfod_find_progress (debuginfod_client *, long a, long b)
}
-static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
- const string& artifacttype /* unsafe */,
- const string& suffix /* unsafe */,
- int *result_fd
- )
+static struct MHD_Response*
+handle_buildid (MHD_Connection* conn,
+ const string& buildid /* unsafe */,
+ const string& artifacttype /* unsafe */,
+ const string& suffix /* unsafe */,
+ int *result_fd)
{
// validate artifacttype
string atype_code;
@@ -1001,12 +1488,17 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
}
else if (atype_code == "S")
{
+ // PR25548
+ // Incoming source queries may come in with either dwarf-level OR canonicalized paths.
+ // We let the query pass with either one.
+
pp = new sqlite_ps (db, "mhd-query-s",
- "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_s where buildid = ? and artifactsrc = ? "
+ "select mtime, sourcetype, source0, source1 from " BUILDIDS "_query_s where buildid = ? and artifactsrc in (?,?) "
"order by sharedprefix(source0,source0ref) desc, mtime desc");
pp->reset();
pp->bind(1, buildid);
pp->bind(2, suffix);
+ pp->bind(3, canon_pathname(suffix));
}
unique_ptr<sqlite_ps> ps_closer(pp); // release pp if exception or return
@@ -1043,6 +1535,35 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
{
debuginfod_set_progressfn (client, & debuginfod_find_progress);
+ if (conn)
+ {
+ // Transcribe incoming User-Agent:
+ string ua = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "User-Agent") ?: "";
+ string ua_complete = string("User-Agent: ") + ua;
+ debuginfod_add_http_header (client, ua_complete.c_str());
+
+ // Compute larger XFF:, for avoiding info loss during
+ // federation, and for future cyclicity detection.
+ string xff = MHD_lookup_connection_value (conn, MHD_HEADER_KIND, "X-Forwarded-For") ?: "";
+ if (xff != "")
+ xff += string(", "); // comma separated list
+
+ // Compute the client's numeric IP address only - so can't merge with conninfo()
+ const union MHD_ConnectionInfo *u = MHD_get_connection_info (conn,
+ MHD_CONNECTION_INFO_CLIENT_ADDRESS);
+ struct sockaddr *so = u ? u->client_addr : 0;
+ char hostname[256] = ""; // RFC1035
+ if (so && so->sa_family == AF_INET)
+ (void) getnameinfo (so, sizeof (struct sockaddr_in), hostname, sizeof (hostname), NULL, 0,
+ NI_NUMERICHOST);
+ else if (so && so->sa_family == AF_INET6)
+ (void) getnameinfo (so, sizeof (struct sockaddr_in6), hostname, sizeof (hostname), NULL, 0,
+ NI_NUMERICHOST);
+
+ string xff_complete = string("X-Forwarded-For: ")+xff+string(hostname);
+ debuginfod_add_http_header (client, xff_complete.c_str());
+ }
+
if (artifacttype == "debuginfo")
fd = debuginfod_find_debuginfo (client,
(const unsigned char*) buildid.c_str(),
@@ -1081,8 +1602,16 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
}
close (fd);
}
- else if (fd != -ENOSYS) // no DEBUGINFOD_URLS configured
- throw libc_exception(-fd, "upstream debuginfod query failed");
+ else
+ switch(fd)
+ {
+ case -ENOSYS:
+ break;
+ case -ENOENT:
+ break;
+ default: // some more tricky error
+ throw libc_exception(-fd, "upstream debuginfod query failed");
+ }
throw reportable_exception(MHD_HTTP_NOT_FOUND, "not found");
}
@@ -1157,13 +1686,22 @@ add_metric(const string& metric,
unique_lock<mutex> lock(metrics_lock);
metrics[key] += value;
}
+#if 0
+static void
+add_metric(const string& metric,
+ int64_t value)
+{
+ unique_lock<mutex> lock(metrics_lock);
+ metrics[metric] += value;
+}
+#endif
// and more for higher arity labels if needed
static struct MHD_Response*
-handle_metrics ()
+handle_metrics (off_t* size)
{
stringstream o;
{
@@ -1175,6 +1713,7 @@ handle_metrics ()
MHD_Response* r = MHD_create_response_from_buffer (os.size(),
(void*) os.c_str(),
MHD_RESPMEM_MUST_COPY);
+ *size = os.size();
MHD_add_response_header (r, "Content-Type", "text/plain");
return r;
}
@@ -1197,8 +1736,11 @@ handler_cb (void * /*cls*/,
struct MHD_Response *r = NULL;
string url_copy = url;
- if (verbose)
- obatched(clog) << conninfo(connection) << " " << method << " " << url << endl;
+ int rc = MHD_NO; // mhd
+ int http_code = 500;
+ off_t http_size = -1;
+ struct timeval tv_start, tv_end;
+ gettimeofday (&tv_start, NULL);
try
{
@@ -1231,12 +1773,21 @@ handler_cb (void * /*cls*/,
}
inc_metric("http_requests_total", "type", artifacttype);
- r = handle_buildid(buildid, artifacttype, suffix, 0); // NB: don't care about result-fd
+ // get the resulting fd so we can report its size
+ int fd;
+ r = handle_buildid(connection, buildid, artifacttype, suffix, &fd);
+ if (r)
+ {
+ struct stat fs;
+ if (fstat(fd, &fs) == 0)
+ http_size = fs.st_size;
+ // libmicrohttpd will close (fd);
+ }
}
else if (url1 == "/metrics")
{
inc_metric("http_requests_total", "type", "metrics");
- r = handle_metrics();
+ r = handle_metrics(& http_size);
}
else
throw reportable_exception("webapi error, unrecognized /operation");
@@ -1244,16 +1795,39 @@ handler_cb (void * /*cls*/,
if (r == 0)
throw reportable_exception("internal error, missing response");
- int rc = MHD_queue_response (connection, MHD_HTTP_OK, r);
+ rc = MHD_queue_response (connection, MHD_HTTP_OK, r);
+ http_code = MHD_HTTP_OK;
MHD_destroy_response (r);
- return rc;
}
catch (const reportable_exception& e)
{
inc_metric("http_responses_total","result","error");
e.report(clog);
- return e.mhd_send_response (connection);
+ http_code = e.code;
+ http_size = e.message.size();
+ rc = e.mhd_send_response (connection);
}
+
+ gettimeofday (&tv_end, NULL);
+ double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
+ obatched(clog) << conninfo(connection)
+ << ' ' << method << ' ' << url
+ << ' ' << http_code << ' ' << http_size
+ << ' ' << (int)(deltas*1000) << "ms"
+ << endl;
+
+ // related prometheus metrics
+ string http_code_str = to_string(http_code);
+ if (http_size >= 0)
+ add_metric("http_responses_transfer_bytes_sum","code",http_code_str,
+ http_size);
+ inc_metric("http_responses_transfer_bytes_count","code",http_code_str);
+
+ add_metric("http_responses_duration_milliseconds_sum","code",http_code_str,
+ deltas*1000); // prometheus prefers _seconds and floating point
+ inc_metric("http_responses_duration_milliseconds_count","code",http_code_str);
+
+ return rc;
}
@@ -1300,7 +1874,7 @@ dwarf_extract_source_paths (Elf *elf, set<string>& debug_sourcefiles)
struct MHD_Response *r = 0;
try
{
- r = handle_buildid (buildid, "debuginfo", "", &alt_fd);
+ r = handle_buildid (0, buildid, "debuginfo", "", &alt_fd);
}
catch (const reportable_exception& e)
{
@@ -1399,7 +1973,8 @@ dwarf_extract_source_paths (Elf *elf, set<string>& debug_sourcefiles)
waldo = (string (comp_dir) + string("/") + string (hat));
else
{
- obatched(clog) << "skipping hat=" << hat << " due to empty comp_dir" << endl;
+ if (verbose > 3)
+ obatched(clog) << "skipping hat=" << hat << " due to empty comp_dir" << endl;
continue;
}
@@ -1544,334 +2119,222 @@ elf_classify (int fd, bool &executable_p, bool &debuginfo_p, string &buildid, se
}
-static semaphore* scan_concurrency_sem = 0; // used to implement -c load limiting
-
-
static void
-scan_source_file_path (const string& dir)
+scan_source_file (const string& rps, const stat_t& st,
+ sqlite_ps& ps_upsert_buildids,
+ sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de,
+ sqlite_ps& ps_upsert_s,
+ sqlite_ps& ps_query,
+ sqlite_ps& ps_scan_done,
+ unsigned& fts_cached,
+ unsigned& fts_executable,
+ unsigned& fts_debuginfo,
+ unsigned& fts_sourcefiles)
{
- obatched(clog) << "fts/file traversing " << dir << endl;
+ /* See if we know of it already. */
+ int rc = ps_query
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .step();
+ ps_query.reset();
+ if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
+ // no need to recheck a file/version we already know
+ // specifically, no need to elf-begin a file we already determined is non-elf
+ // (so is stored with buildid=NULL)
+ {
+ fts_cached++;
+ return;
+ }
- struct timeval tv_start, tv_end;
- gettimeofday (&tv_start, NULL);
-
- sqlite_ps ps_upsert_buildids (db, "file-buildids-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
- sqlite_ps ps_upsert_files (db, "file-files-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
- sqlite_ps ps_upsert_de (db, "file-de-upsert",
- "insert or ignore into " BUILDIDS "_f_de "
- "(buildid, debuginfo_p, executable_p, file, mtime) "
- "values ((select id from " BUILDIDS "_buildids where hex = ?),"
- " ?,?,"
- " (select id from " BUILDIDS "_files where name = ?), ?);");
- sqlite_ps ps_upsert_s (db, "file-s-upsert",
- "insert or ignore into " BUILDIDS "_f_s "
- "(buildid, artifactsrc, file, mtime) "
- "values ((select id from " BUILDIDS "_buildids where hex = ?),"
- " (select id from " BUILDIDS "_files where name = ?),"
- " (select id from " BUILDIDS "_files where name = ?),"
- " ?);");
- sqlite_ps ps_query (db, "file-negativehit-find",
- "select 1 from " BUILDIDS "_file_mtime_scanned where sourcetype = 'F' and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
- sqlite_ps ps_scan_done (db, "file-scanned",
- "insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
- "values ('F', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
+ bool executable_p = false, debuginfo_p = false; // E and/or D
+ string buildid;
+ set<string> sourcefiles;
+ int fd = open (rps.c_str(), O_RDONLY);
+ try
+ {
+ if (fd >= 0)
+ elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles);
+ else
+ throw libc_exception(errno, string("open ") + rps);
+ inc_metric ("scanned_total","source","file");
+ }
+ // NB: we catch exceptions here too, so that we can
+ // cache the corrupt-elf case (!executable_p &&
+ // !debuginfo_p) just below, just as if we had an
+ // EPERM error from open(2).
+ catch (const reportable_exception& e)
+ {
+ e.report(clog);
+ }
- char * const dirs[] = { (char*) dir.c_str(), NULL };
+ if (fd >= 0)
+ close (fd);
- unsigned fts_scanned=0, fts_regex=0, fts_cached=0, fts_debuginfo=0, fts_executable=0, fts_sourcefiles=0;
+ // register this file name in the interning table
+ ps_upsert_files
+ .reset()
+ .bind(1, rps)
+ .step_ok_done();
- FTS *fts = fts_open (dirs,
- (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
- | FTS_NOCHDIR /* multithreaded */,
- NULL);
- if (fts == NULL)
+ if (buildid == "")
{
- obatched(cerr) << "cannot fts_open " << dir << endl;
- return;
+ // no point storing an elf file without buildid
+ executable_p = false;
+ debuginfo_p = false;
}
-
- FTSENT *f;
- while ((f = fts_read (fts)) != NULL)
+ else
{
- semaphore_borrower handle_one_file (scan_concurrency_sem);
+ // register this build-id in the interning table
+ ps_upsert_buildids
+ .reset()
+ .bind(1, buildid)
+ .step_ok_done();
+ }
- fts_scanned ++;
- if (interrupted)
- break;
+ if (executable_p)
+ fts_executable ++;
+ if (debuginfo_p)
+ fts_debuginfo ++;
+ if (executable_p || debuginfo_p)
+ {
+ ps_upsert_de
+ .reset()
+ .bind(1, buildid)
+ .bind(2, debuginfo_p ? 1 : 0)
+ .bind(3, executable_p ? 1 : 0)
+ .bind(4, rps)
+ .bind(5, st.st_mtime)
+ .step_ok_done();
+ }
+ if (executable_p)
+ inc_metric("found_executable_total","source","files");
+ if (debuginfo_p)
+ inc_metric("found_debuginfo_total","source","files");
- if (verbose > 2)
- obatched(clog) << "fts/file traversing " << f->fts_path << endl;
+ if (sourcefiles.size() && buildid != "")
+ {
+ fts_sourcefiles += sourcefiles.size();
- try
+ for (auto&& dwarfsrc : sourcefiles)
{
- /* Found a file. Convert it to an absolute path, so
- the buildid database does not have relative path
- names that are unresolvable from a subsequent run
- in a different cwd. */
- char *rp = realpath(f->fts_path, NULL);
- if (rp == NULL)
- continue; // ignore dangling symlink or such
- string rps = string(rp);
- free (rp);
-
- bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
- bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
- if (!ri || rx)
- {
- if (verbose > 3)
- obatched(clog) << "fts/file skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
- fts_regex ++;
- continue;
- }
-
- switch (f->fts_info)
- {
- case FTS_D:
- break;
-
- case FTS_DP:
- break;
-
- case FTS_F:
- {
- /* See if we know of it already. */
- int rc = ps_query
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .step();
- ps_query.reset();
- if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
- // no need to recheck a file/version we already know
- // specifically, no need to elf-begin a file we already determined is non-elf
- // (so is stored with buildid=NULL)
- {
- fts_cached ++;
- continue;
- }
-
- bool executable_p = false, debuginfo_p = false; // E and/or D
- string buildid;
- set<string> sourcefiles;
-
- int fd = open (rps.c_str(), O_RDONLY);
- try
- {
- if (fd >= 0)
- elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles);
- else
- throw libc_exception(errno, string("open ") + rps);
- inc_metric ("scanned_total","source","file");
- }
-
- // NB: we catch exceptions here too, so that we can
- // cache the corrupt-elf case (!executable_p &&
- // !debuginfo_p) just below, just as if we had an
- // EPERM error from open(2).
-
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
-
- if (fd >= 0)
- close (fd);
-
- // register this file name in the interning table
- ps_upsert_files
- .reset()
- .bind(1, rps)
- .step_ok_done();
-
- if (buildid == "")
- {
- // no point storing an elf file without buildid
- executable_p = false;
- debuginfo_p = false;
- }
- else
- {
- // register this build-id in the interning table
- ps_upsert_buildids
- .reset()
- .bind(1, buildid)
- .step_ok_done();
- }
-
- if (executable_p)
- fts_executable ++;
- if (debuginfo_p)
- fts_debuginfo ++;
- if (executable_p || debuginfo_p)
- {
- ps_upsert_de
- .reset()
- .bind(1, buildid)
- .bind(2, debuginfo_p ? 1 : 0)
- .bind(3, executable_p ? 1 : 0)
- .bind(4, rps)
- .bind(5, f->fts_statp->st_mtime)
- .step_ok_done();
- }
- if (executable_p)
- inc_metric("found_executable_total","source","files");
- if (debuginfo_p)
- inc_metric("found_debuginfo_total","source","files");
-
- if (sourcefiles.size() && buildid != "")
- {
- fts_sourcefiles += sourcefiles.size();
-
- for (auto&& dwarfsrc : sourcefiles)
- {
- char *srp = realpath(dwarfsrc.c_str(), NULL);
- if (srp == NULL) // also if DWZ unresolved dwarfsrc=""
- continue; // unresolvable files are not a serious problem
- // throw libc_exception(errno, "fts/file realpath " + srcpath);
- string srps = string(srp);
- free (srp);
-
- struct stat sfs;
- rc = stat(srps.c_str(), &sfs);
- if (rc != 0)
- continue;
-
- if (verbose > 2)
- obatched(clog) << "recorded buildid=" << buildid << " file=" << srps
- << " mtime=" << sfs.st_mtime
- << " as source " << dwarfsrc << endl;
-
- ps_upsert_files
- .reset()
- .bind(1, srps)
- .step_ok_done();
-
- // register the dwarfsrc name in the interning table too
- ps_upsert_files
- .reset()
- .bind(1, dwarfsrc)
- .step_ok_done();
-
- ps_upsert_s
- .reset()
- .bind(1, buildid)
- .bind(2, dwarfsrc)
- .bind(3, srps)
- .bind(4, sfs.st_mtime)
- .step_ok_done();
-
- inc_metric("found_sourcerefs_total","source","files");
- }
- }
-
- ps_scan_done
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .bind(3, f->fts_statp->st_size)
- .step_ok_done();
-
- if (verbose > 2)
- obatched(clog) << "recorded buildid=" << buildid << " file=" << rps
- << " mtime=" << f->fts_statp->st_mtime << " atype="
- << (executable_p ? "E" : "")
- << (debuginfo_p ? "D" : "") << endl;
- }
- break;
+ char *srp = realpath(dwarfsrc.c_str(), NULL);
+ if (srp == NULL) // also if DWZ unresolved dwarfsrc=""
+ continue; // unresolvable files are not a serious problem
+ // throw libc_exception(errno, "fts/file realpath " + srcpath);
+ string srps = string(srp);
+ free (srp);
+
+ struct stat sfs;
+ rc = stat(srps.c_str(), &sfs);
+ if (rc != 0)
+ continue;
- case FTS_ERR:
- case FTS_NS:
- throw libc_exception(f->fts_errno, string("fts/file traversal ") + string(f->fts_path));
+ if (verbose > 2)
+ obatched(clog) << "recorded buildid=" << buildid << " file=" << srps
+ << " mtime=" << sfs.st_mtime
+ << " as source " << dwarfsrc << endl;
- default:
- case FTS_SL: /* ignore symlinks; seen in non-L mode only */
- break;
- }
+ ps_upsert_files
+ .reset()
+ .bind(1, srps)
+ .step_ok_done();
- if ((verbose && f->fts_info == FTS_DP) ||
- (verbose > 1 && f->fts_info == FTS_F))
- obatched(clog) << "fts/file traversing " << rps << ", scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable << ", source=" << fts_sourcefiles << endl;
- }
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
- }
- fts_close (fts);
+ // register the dwarfsrc name in the interning table too
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc)
+ .step_ok_done();
- gettimeofday (&tv_end, NULL);
- double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
+ ps_upsert_s
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc)
+ .bind(3, srps)
+ .bind(4, sfs.st_mtime)
+ .step_ok_done();
- obatched(clog) << "fts/file traversed " << dir << " in " << deltas << "s, scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable << ", source=" << fts_sourcefiles << endl;
-}
+ // PR25548: also store canonicalized source path
+ string dwarfsrc_canon = canon_pathname (dwarfsrc);
+ if (dwarfsrc_canon != dwarfsrc)
+ {
+ if (verbose > 3)
+ obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl;
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc_canon)
+ .step_ok_done();
-static void*
-thread_main_scan_source_file_path (void* arg)
-{
- string dir = string((const char*) arg);
+ ps_upsert_s
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc_canon)
+ .bind(3, srps)
+ .bind(4, sfs.st_mtime)
+ .step_ok_done();
+ }
- unsigned rescan_timer = 0;
- sig_atomic_t forced_rescan_count = 0;
- set_metric("thread_timer_max", "file", dir, rescan_s);
- set_metric("thread_tid", "file", dir, tid());
- while (! interrupted)
- {
- set_metric("thread_timer", "file", dir, rescan_timer);
- set_metric("thread_forced_total", "file", dir, forced_rescan_count);
- if (rescan_s && rescan_timer > rescan_s)
- rescan_timer = 0;
- if (sigusr1 != forced_rescan_count)
- {
- forced_rescan_count = sigusr1;
- rescan_timer = 0;
+ inc_metric("found_sourcerefs_total","source","files");
}
- if (rescan_timer == 0)
- try
- {
- set_metric("thread_working", "file", dir, time(NULL));
- inc_metric("thread_work_total", "file", dir);
- scan_source_file_path (dir);
- set_metric("thread_working", "file", dir, 0);
- }
- catch (const sqlite_exception& e)
- {
- obatched(cerr) << e.message << endl;
- }
- sleep (1);
- rescan_timer ++;
}
- return 0;
+ ps_scan_done
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .bind(3, st.st_size)
+ .step_ok_done();
+
+ if (verbose > 2)
+ obatched(clog) << "recorded buildid=" << buildid << " file=" << rps
+ << " mtime=" << st.st_mtime << " atype="
+ << (executable_p ? "E" : "")
+ << (debuginfo_p ? "D" : "") << endl;
}
-////////////////////////////////////////////////////////////////////////
-
-// Analyze given *.rpm file of given age; record buildids / exec/debuginfo-ness of its
+// Analyze given archive file of given age; record buildids / exec/debuginfo-ness of its
// constituent files with given upsert statements.
static void
-rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_upsert_files,
- sqlite_ps& ps_upsert_de, sqlite_ps& ps_upsert_sref, sqlite_ps& ps_upsert_sdef,
- time_t mtime,
- unsigned& fts_executable, unsigned& fts_debuginfo, unsigned& fts_sref, unsigned& fts_sdef,
- bool& fts_sref_complete_p)
+archive_classify (const string& rps, string& archive_extension,
+ sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de, sqlite_ps& ps_upsert_sref, sqlite_ps& ps_upsert_sdef,
+ time_t mtime,
+ unsigned& fts_executable, unsigned& fts_debuginfo, unsigned& fts_sref, unsigned& fts_sdef,
+ bool& fts_sref_complete_p)
{
- string popen_cmd = string("rpm2cpio " + shell_escape(rps));
- FILE* fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
- if (fp == NULL)
- throw libc_exception (errno, string("popen ") + popen_cmd);
- defer_dtor<FILE*,int> fp_closer (fp, pclose);
+ string archive_decoder = "/dev/null";
+ for (auto&& arch : scan_archives)
+ if (string_endswith(rps, arch.first))
+ {
+ archive_extension = arch.first;
+ archive_decoder = arch.second;
+ }
+
+ FILE* fp;
+ defer_dtor<FILE*,int>::dtor_fn dfn;
+ if (archive_decoder != "cat")
+ {
+ string popen_cmd = archive_decoder + " " + shell_escape(rps);
+ fp = popen (popen_cmd.c_str(), "r"); // "e" O_CLOEXEC?
+ dfn = pclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("popen ") + popen_cmd);
+ }
+ else
+ {
+ fp = fopen (rps.c_str(), "r");
+ dfn = fclose;
+ if (fp == NULL)
+ throw libc_exception (errno, string("fopen ") + rps);
+ }
+ defer_dtor<FILE*,int> fp_closer (fp, dfn);
struct archive *a;
a = archive_read_new();
@@ -1879,19 +2342,19 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
throw archive_exception("cannot create archive reader");
defer_dtor<struct archive*,int> archive_closer (a, archive_read_free);
- int rc = archive_read_support_format_cpio(a);
+ int rc = archive_read_support_format_all(a);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot select cpio format");
+ throw archive_exception(a, "cannot select all formats");
rc = archive_read_support_filter_all(a);
if (rc != ARCHIVE_OK)
throw archive_exception(a, "cannot select all filters");
rc = archive_read_open_FILE (a, fp);
if (rc != ARCHIVE_OK)
- throw archive_exception(a, "cannot open archive from rpm2cpio pipe");
+ throw archive_exception(a, "cannot open archive from pipe");
if (verbose > 3)
- obatched(clog) << "rpm2cpio|libarchive scanning " << rps << endl;
+ obatched(clog) << "libarchive scanning " << rps << endl;
while(1) // parse cpio archive entries
{
@@ -1905,17 +2368,14 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
continue;
- string fn = archive_entry_pathname (e);
- if (fn.size() > 1 && fn[0] == '.')
- fn = fn.substr(1); // trim off the leading '.'
+ string fn = canonicalized_archive_entry_pathname (e);
if (verbose > 3)
- obatched(clog) << "rpm2cpio|libarchive checking " << fn << endl;
+ obatched(clog) << "libarchive checking " << fn << endl;
// extract this file to a temporary file
- const char *tmpdir_env = getenv ("TMPDIR") ?: "/tmp";
char* tmppath = NULL;
- rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir_env);
+ rc = asprintf (&tmppath, "%s/debuginfod.XXXXXX", tmpdir.c_str());
if (rc < 0)
throw libc_exception (ENOMEM, "cannot allocate tmppath");
defer_dtor<void*,void> tmmpath_freer (tmppath, free);
@@ -1978,6 +2438,26 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
.bind(2, s)
.step_ok_done();
+ // PR25548: also store canonicalized source path
+ const string& dwarfsrc = s;
+ string dwarfsrc_canon = canon_pathname (dwarfsrc);
+ if (dwarfsrc_canon != dwarfsrc)
+ {
+ if (verbose > 3)
+ obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl;
+
+ ps_upsert_files
+ .reset()
+ .bind(1, dwarfsrc_canon)
+ .step_ok_done();
+
+ ps_upsert_sref
+ .reset()
+ .bind(1, buildid)
+ .bind(2, dwarfsrc_canon)
+ .step_ok_done();
+ }
+
fts_sref ++;
}
}
@@ -2027,250 +2507,351 @@ rpm_classify (const string& rps, sqlite_ps& ps_upsert_buildids, sqlite_ps& ps_up
-// scan for *.rpm files
+// scan for archive files such as .rpm
static void
-scan_source_rpm_path (const string& dir)
+scan_archive_file (const string& rps, const stat_t& st,
+ sqlite_ps& ps_upsert_buildids,
+ sqlite_ps& ps_upsert_files,
+ sqlite_ps& ps_upsert_de,
+ sqlite_ps& ps_upsert_sref,
+ sqlite_ps& ps_upsert_sdef,
+ sqlite_ps& ps_query,
+ sqlite_ps& ps_scan_done,
+ unsigned& fts_cached,
+ unsigned& fts_executable,
+ unsigned& fts_debuginfo,
+ unsigned& fts_sref,
+ unsigned& fts_sdef)
+{
+ /* See if we know of it already. */
+ int rc = ps_query
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .step();
+ ps_query.reset();
+ if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
+ // no need to recheck a file/version we already know
+ // specifically, no need to parse this archive again, since we already have
+ // it as a D or E or S record,
+ // (so is stored with buildid=NULL)
+ {
+ fts_cached ++;
+ return;
+ }
+
+ // intern the archive file name
+ ps_upsert_files
+ .reset()
+ .bind(1, rps)
+ .step_ok_done();
+
+ // extract the archive contents
+ unsigned my_fts_executable = 0, my_fts_debuginfo = 0, my_fts_sref = 0, my_fts_sdef = 0;
+ bool my_fts_sref_complete_p = true;
+ try
+ {
+ string archive_extension;
+ archive_classify (rps, archive_extension,
+ ps_upsert_buildids, ps_upsert_files,
+ ps_upsert_de, ps_upsert_sref, ps_upsert_sdef, // dalt
+ st.st_mtime,
+ my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef,
+ my_fts_sref_complete_p);
+ inc_metric ("scanned_total","source",archive_extension + " archive");
+ add_metric("found_debuginfo_total","source",archive_extension + " archive",
+ my_fts_debuginfo);
+ add_metric("found_executable_total","source",archive_extension + " archive",
+ my_fts_executable);
+ add_metric("found_sourcerefs_total","source",archive_extension + " archive",
+ my_fts_sref);
+ }
+ catch (const reportable_exception& e)
+ {
+ e.report(clog);
+ }
+
+ if (verbose > 2)
+ obatched(clog) << "scanned archive=" << rps
+ << " mtime=" << st.st_mtime
+ << " executables=" << my_fts_executable
+ << " debuginfos=" << my_fts_debuginfo
+ << " srefs=" << my_fts_sref
+ << " sdefs=" << my_fts_sdef
+ << endl;
+
+ fts_executable += my_fts_executable;
+ fts_debuginfo += my_fts_debuginfo;
+ fts_sref += my_fts_sref;
+ fts_sdef += my_fts_sdef;
+
+ if (my_fts_sref_complete_p) // leave incomplete?
+ ps_scan_done
+ .reset()
+ .bind(1, rps)
+ .bind(2, st.st_mtime)
+ .bind(3, st.st_size)
+ .step_ok_done();
+}
+
+
+
+////////////////////////////////////////////////////////////////////////
+
+
+
+// The thread that consumes file names off of the scanq. We hold
+// the persistent sqlite_ps's at this level and delegate file/archive
+// scanning to other functions.
+static void*
+thread_main_scanner (void* arg)
{
- obatched(clog) << "fts/rpm traversing " << dir << endl;
+ (void) arg;
+
+ // all the prepared statements fit to use, the _f_ set:
+ sqlite_ps ps_f_upsert_buildids (db, "file-buildids-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
+ sqlite_ps ps_f_upsert_files (db, "file-files-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
+ sqlite_ps ps_f_upsert_de (db, "file-de-upsert",
+ "insert or ignore into " BUILDIDS "_f_de "
+ "(buildid, debuginfo_p, executable_p, file, mtime) "
+ "values ((select id from " BUILDIDS "_buildids where hex = ?),"
+ " ?,?,"
+ " (select id from " BUILDIDS "_files where name = ?), ?);");
+ sqlite_ps ps_f_upsert_s (db, "file-s-upsert",
+ "insert or ignore into " BUILDIDS "_f_s "
+ "(buildid, artifactsrc, file, mtime) "
+ "values ((select id from " BUILDIDS "_buildids where hex = ?),"
+ " (select id from " BUILDIDS "_files where name = ?),"
+ " (select id from " BUILDIDS "_files where name = ?),"
+ " ?);");
+ sqlite_ps ps_f_query (db, "file-negativehit-find",
+ "select 1 from " BUILDIDS "_file_mtime_scanned where sourcetype = 'F' "
+ "and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
+ sqlite_ps ps_f_scan_done (db, "file-scanned",
+ "insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
+ "values ('F', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
- sqlite_ps ps_upsert_buildids (db, "rpm-buildid-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
- sqlite_ps ps_upsert_files (db, "rpm-file-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
- sqlite_ps ps_upsert_de (db, "rpm-de-insert",
+ // and now for the _r_ set
+ sqlite_ps ps_r_upsert_buildids (db, "rpm-buildid-intern", "insert or ignore into " BUILDIDS "_buildids VALUES (NULL, ?);");
+ sqlite_ps ps_r_upsert_files (db, "rpm-file-intern", "insert or ignore into " BUILDIDS "_files VALUES (NULL, ?);");
+ sqlite_ps ps_r_upsert_de (db, "rpm-de-insert",
"insert or ignore into " BUILDIDS "_r_de (buildid, debuginfo_p, executable_p, file, mtime, content) values ("
"(select id from " BUILDIDS "_buildids where hex = ?), ?, ?, "
"(select id from " BUILDIDS "_files where name = ?), ?, "
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_upsert_sref (db, "rpm-sref-insert",
+ sqlite_ps ps_r_upsert_sref (db, "rpm-sref-insert",
"insert or ignore into " BUILDIDS "_r_sref (buildid, artifactsrc) values ("
"(select id from " BUILDIDS "_buildids where hex = ?), "
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_upsert_sdef (db, "rpm-sdef-insert",
+ sqlite_ps ps_r_upsert_sdef (db, "rpm-sdef-insert",
"insert or ignore into " BUILDIDS "_r_sdef (file, mtime, content) values ("
"(select id from " BUILDIDS "_files where name = ?), ?,"
"(select id from " BUILDIDS "_files where name = ?));");
- sqlite_ps ps_query (db, "rpm-negativehit-query",
+ sqlite_ps ps_r_query (db, "rpm-negativehit-query",
"select 1 from " BUILDIDS "_file_mtime_scanned where "
"sourcetype = 'R' and file = (select id from " BUILDIDS "_files where name = ?) and mtime = ?;");
- sqlite_ps ps_scan_done (db, "rpm-scanned",
+ sqlite_ps ps_r_scan_done (db, "rpm-scanned",
"insert or ignore into " BUILDIDS "_file_mtime_scanned (sourcetype, file, mtime, size)"
"values ('R', (select id from " BUILDIDS "_files where name = ?), ?, ?);");
- char * const dirs[] = { (char*) dir.c_str(), NULL };
- struct timeval tv_start, tv_end;
- gettimeofday (&tv_start, NULL);
- unsigned fts_scanned=0, fts_regex=0, fts_cached=0, fts_debuginfo=0;
- unsigned fts_executable=0, fts_rpm = 0, fts_sref=0, fts_sdef=0;
+ unsigned fts_cached = 0, fts_executable = 0, fts_debuginfo = 0, fts_sourcefiles = 0;
+ unsigned fts_sref = 0, fts_sdef = 0;
- FTS *fts = fts_open (dirs,
- (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
- | FTS_NOCHDIR /* multithreaded */,
- NULL);
- if (fts == NULL)
- {
- obatched(cerr) << "cannot fts_open " << dir << endl;
- return;
- }
-
- FTSENT *f;
- while ((f = fts_read (fts)) != NULL)
+ add_metric("thread_count", "role", "scan", 1);
+ add_metric("thread_busy", "role", "scan", 1);
+ while (! interrupted)
{
- semaphore_borrower handle_one_file (scan_concurrency_sem);
-
- fts_scanned ++;
- if (interrupted)
- break;
+ scan_payload p;
- if (verbose > 2)
- obatched(clog) << "fts/rpm traversing " << f->fts_path << endl;
+ add_metric("thread_busy", "role", "scan", -1);
+ bool gotone = scanq.wait_front(p);
+ add_metric("thread_busy", "role", "scan", 1);
+ if (! gotone) continue; // or break
try
{
- /* Found a file. Convert it to an absolute path, so
- the buildid database does not have relative path
- names that are unresolvable from a subsequent run
- in a different cwd. */
- char *rp = realpath(f->fts_path, NULL);
- if (rp == NULL)
- continue; // ignore dangling symlink or such
- string rps = string(rp);
- free (rp);
-
- bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
- bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
- if (!ri || rx)
- {
- if (verbose > 3)
- obatched(clog) << "fts/rpm skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
- fts_regex ++;
- continue;
- }
+ bool scan_archive = false;
+ for (auto&& arch : scan_archives)
+ if (string_endswith(p.first, arch.first))
+ scan_archive = true;
+
+ if (scan_archive)
+ scan_archive_file (p.first, p.second,
+ ps_r_upsert_buildids,
+ ps_r_upsert_files,
+ ps_r_upsert_de,
+ ps_r_upsert_sref,
+ ps_r_upsert_sdef,
+ ps_r_query,
+ ps_r_scan_done,
+ fts_cached,
+ fts_executable,
+ fts_debuginfo,
+ fts_sref,
+ fts_sdef);
+
+ if (scan_files) // NB: maybe "else if" ?
+ scan_source_file (p.first, p.second,
+ ps_f_upsert_buildids,
+ ps_f_upsert_files,
+ ps_f_upsert_de,
+ ps_f_upsert_s,
+ ps_f_query,
+ ps_f_scan_done,
+ fts_cached, fts_executable, fts_debuginfo, fts_sourcefiles);
+ }
+ catch (const reportable_exception& e)
+ {
+ e.report(cerr);
+ }
- switch (f->fts_info)
- {
- case FTS_D:
- break;
+ inc_metric("thread_work_total", "role","scan");
+ }
- case FTS_DP:
- break;
+ add_metric("thread_busy", "role", "scan", -1);
+ return 0;
+}
- case FTS_F:
- {
- // heuristic: reject if file name does not end with ".rpm"
- // (alternative: try opening with librpm etc., caching)
- string suffix = ".rpm";
- if (rps.size() < suffix.size() ||
- rps.substr(rps.size()-suffix.size()) != suffix)
- continue;
- fts_rpm ++;
-
- /* See if we know of it already. */
- int rc = ps_query
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .step();
- ps_query.reset();
- if (rc == SQLITE_ROW) // i.e., a result, as opposed to DONE (no results)
- // no need to recheck a file/version we already know
- // specifically, no need to parse this rpm again, since we already have
- // it as a D or E or S record,
- // (so is stored with buildid=NULL)
- {
- fts_cached ++;
- continue;
- }
-
- // intern the rpm file name
- ps_upsert_files
- .reset()
- .bind(1, rps)
- .step_ok_done();
-
- // extract the rpm contents via popen("rpm2cpio") | libarchive | loop-of-elf_classify()
- unsigned my_fts_executable = 0, my_fts_debuginfo = 0, my_fts_sref = 0, my_fts_sdef = 0;
- bool my_fts_sref_complete_p = true;
- try
- {
- rpm_classify (rps,
- ps_upsert_buildids, ps_upsert_files,
- ps_upsert_de, ps_upsert_sref, ps_upsert_sdef, // dalt
- f->fts_statp->st_mtime,
- my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef,
- my_fts_sref_complete_p);
- inc_metric ("scanned_total","source","rpm");
- add_metric("found_debuginfo_total","source","rpm",
- my_fts_debuginfo);
- add_metric("found_executable_total","source","rpm",
- my_fts_executable);
- add_metric("found_sourcerefs_total","source","rpm",
- my_fts_sref);
- }
- catch (const reportable_exception& e)
- {
- e.report(clog);
- }
-
- if (verbose > 2)
- obatched(clog) << "scanned rpm=" << rps
- << " mtime=" << f->fts_statp->st_mtime
- << " executables=" << my_fts_executable
- << " debuginfos=" << my_fts_debuginfo
- << " srefs=" << my_fts_sref
- << " sdefs=" << my_fts_sdef
- << endl;
-
- fts_executable += my_fts_executable;
- fts_debuginfo += my_fts_debuginfo;
- fts_sref += my_fts_sref;
- fts_sdef += my_fts_sdef;
-
- if (my_fts_sref_complete_p) // leave incomplete?
- ps_scan_done
- .reset()
- .bind(1, rps)
- .bind(2, f->fts_statp->st_mtime)
- .bind(3, f->fts_statp->st_size)
- .step_ok_done();
- }
- break;
- case FTS_ERR:
- case FTS_NS:
- throw libc_exception(f->fts_errno, string("fts/rpm traversal ") + string(f->fts_path));
- default:
- case FTS_SL: /* ignore symlinks; seen in non-L mode only */
- break;
- }
+// The thread that traverses all the source_paths and enqueues all the
+// matching files into the file/archive scan queue.
+static void
+scan_source_paths()
+{
+ // NB: fedora 31 glibc/fts(3) crashes inside fts_read() on empty
+ // path list.
+ if (source_paths.empty())
+ return;
- if ((verbose && f->fts_info == FTS_DP) ||
- (verbose > 1 && f->fts_info == FTS_F))
- obatched(clog) << "fts/rpm traversing " << rps << ", scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", rpm=" << fts_rpm << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable
- << ", sourcerefs=" << fts_sref << ", sourcedefs=" << fts_sdef << endl;
- }
- catch (const reportable_exception& e)
+ // Turn the source_paths into an fts(3)-compatible char**. Since
+ // source_paths[] does not change after argv processing, the
+ // c_str()'s are safe to keep around awile.
+ vector<const char *> sps;
+ for (auto&& sp: source_paths)
+ sps.push_back(sp.c_str());
+ sps.push_back(NULL);
+
+ FTS *fts = fts_open ((char * const *)sps.data(),
+ (traverse_logical ? FTS_LOGICAL : FTS_PHYSICAL|FTS_XDEV)
+ | FTS_NOCHDIR /* multithreaded */,
+ NULL);
+ if (fts == NULL)
+ throw libc_exception(errno, "cannot fts_open");
+ defer_dtor<FTS*,int> fts_cleanup (fts, fts_close);
+
+ struct timeval tv_start, tv_end;
+ gettimeofday (&tv_start, NULL);
+ unsigned fts_scanned = 0, fts_regex = 0;
+
+ FTSENT *f;
+ while ((f = fts_read (fts)) != NULL)
+ {
+ if (interrupted) break;
+
+ fts_scanned ++;
+
+ if (verbose > 2)
+ obatched(clog) << "fts traversing " << f->fts_path << endl;
+
+ /* Found a file. Convert it to an absolute path, so
+ the buildid database does not have relative path
+ names that are unresolvable from a subsequent run
+ in a different cwd. */
+ char *rp = realpath(f->fts_path, NULL);
+ if (rp == NULL)
+ continue; // ignore dangling symlink or such
+ string rps = string(rp);
+ free (rp);
+
+ bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0);
+ bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0);
+ if (!ri || rx)
+ {
+ if (verbose > 3)
+ obatched(clog) << "fts skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl;
+ fts_regex ++;
+ continue;
+ }
+
+ switch (f->fts_info)
+ {
+ case FTS_F:
+ scanq.push_back (make_pair(rps, *f->fts_statp));
+ break;
+
+ case FTS_ERR:
+ case FTS_NS:
+ // report on some types of errors because they may reflect fixable misconfiguration
{
- e.report(clog);
+ auto x = libc_exception(f->fts_errno, string("fts traversal ") + string(f->fts_path));
+ x.report(cerr);
}
- }
- fts_close (fts);
+ break;
+ default:
+ ;
+ /* ignore */
+ }
+ }
gettimeofday (&tv_end, NULL);
double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
- obatched(clog) << "fts/rpm traversed " << dir << " in " << deltas << "s, scanned=" << fts_scanned
- << ", regex-skipped=" << fts_regex
- << ", rpm=" << fts_rpm << ", cached=" << fts_cached << ", debuginfo=" << fts_debuginfo
- << ", executable=" << fts_executable
- << ", sourcerefs=" << fts_sref << ", sourcedefs=" << fts_sdef << endl;
+ obatched(clog) << "fts traversed source paths in " << deltas << "s, scanned=" << fts_scanned
+ << ", regex-skipped=" << fts_regex << endl;
}
-
static void*
-thread_main_scan_source_rpm_path (void* arg)
+thread_main_fts_source_paths (void* arg)
{
- string dir = string((const char*) arg);
+ (void) arg; // ignore; we operate on global data
- unsigned rescan_timer = 0;
sig_atomic_t forced_rescan_count = 0;
- set_metric("thread_timer_max", "rpm", dir, rescan_s);
- set_metric("thread_tid", "rpm", dir, tid());
+ set_metric("thread_tid", "role","traverse", tid());
+ add_metric("thread_count", "role", "traverse", 1);
+
+ time_t last_rescan = 0;
+
while (! interrupted)
{
- set_metric("thread_timer", "rpm", dir, rescan_timer);
- set_metric("thread_forced_total", "rpm", dir, forced_rescan_count);
- if (rescan_s && rescan_timer > rescan_s)
- rescan_timer = 0;
+ sleep (1);
+ scanq.wait_idle(); // don't start a new traversal while scanners haven't finished the job
+ scanq.done_idle(); // release the hounds
+ if (interrupted) break;
+
+ time_t now = time(NULL);
+ bool rescan_now = false;
+ if (last_rescan == 0) // at least one initial rescan is documented even for -t0
+ rescan_now = true;
+ if (rescan_s > 0 && (long)now > (long)(last_rescan + rescan_s))
+ rescan_now = true;
if (sigusr1 != forced_rescan_count)
{
forced_rescan_count = sigusr1;
- rescan_timer = 0;
+ rescan_now = true;
}
- if (rescan_timer == 0)
+ if (rescan_now)
try
{
- set_metric("thread_working", "rpm", dir, time(NULL));
- inc_metric("thread_work_total", "rpm", dir);
- scan_source_rpm_path (dir);
- set_metric("thread_working", "rpm", dir, 0);
+ set_metric("thread_busy", "role","traverse", 1);
+ scan_source_paths();
+ last_rescan = time(NULL); // NB: now was before scanning
+ inc_metric("thread_work_total", "role","traverse");
+ set_metric("thread_busy", "role","traverse", 0);
}
- catch (const sqlite_exception& e)
+ catch (const reportable_exception& e)
{
- obatched(cerr) << e.message << endl;
+ e.report(cerr);
}
- sleep (1);
- rescan_timer ++;
}
return 0;
}
+
////////////////////////////////////////////////////////////////////////
static void
@@ -2359,6 +2940,9 @@ void groom()
sqlite3_db_release_memory(db); // shrink the process if possible
+ fdcache.limit(0,0); // release the fdcache contents
+ fdcache.limit(fdcache_fds,fdcache_mbs); // restore status quo parameters
+
gettimeofday (&tv_end, NULL);
double deltas = (tv_end.tv_sec - tv_start.tv_sec) + (tv_end.tv_usec - tv_start.tv_usec)*0.000001;
@@ -2369,35 +2953,44 @@ void groom()
static void*
thread_main_groom (void* /*arg*/)
{
- unsigned groom_timer = 0;
sig_atomic_t forced_groom_count = 0;
- set_metric("thread_timer_max", "role", "groom", groom_s);
set_metric("thread_tid", "role", "groom", tid());
- while (! interrupted)
+ add_metric("thread_count", "role", "groom", 1);
+
+ time_t last_groom = 0;
+
+ while (1)
{
- set_metric("thread_timer", "role", "groom", groom_timer);
- set_metric("thread_forced_total", "role", "groom", forced_groom_count);
- if (groom_s && groom_timer > groom_s)
- groom_timer = 0;
+ sleep (1);
+ scanq.wait_idle(); // PR25394: block scanners during grooming!
+ if (interrupted) break;
+
+ time_t now = time(NULL);
+ bool groom_now = false;
+ if (last_groom == 0) // at least one initial groom is documented even for -g0
+ groom_now = true;
+ if (groom_s > 0 && (long)now > (long)(last_groom + groom_s))
+ groom_now = true;
if (sigusr2 != forced_groom_count)
{
forced_groom_count = sigusr2;
- groom_timer = 0;
+ groom_now = true;
}
- if (groom_timer == 0)
+ if (groom_now)
try
{
- set_metric("thread_working", "role", "groom", time(NULL));
- inc_metric("thread_work_total", "role", "groom");
+ set_metric("thread_busy", "role", "groom", 1);
groom ();
- set_metric("thread_working", "role", "groom", 0);
+ last_groom = time(NULL); // NB: now was before grooming
+ inc_metric("thread_work_total", "role", "groom");
+ set_metric("thread_busy", "role", "groom", 0);
}
catch (const sqlite_exception& e)
{
obatched(cerr) << e.message << endl;
}
- sleep (1);
- groom_timer ++;
+
+ scanq.done_idle();
}
return 0;
@@ -2473,6 +3066,8 @@ main (int argc, char *argv[])
/* Tell the library which version we are expecting. */
elf_version (EV_CURRENT);
+ tmpdir = string(getenv("TMPDIR") ?: "/tmp");
+
/* Set computed default values. */
db_path = string(getenv("HOME") ?: "/") + string("/.debuginfod.sqlite"); /* XDG? */
int rc = regcomp (& file_include_regex, ".*", REG_EXTENDED|REG_NOSUB); // match everything
@@ -2482,6 +3077,16 @@ main (int argc, char *argv[])
if (rc != 0)
error (EXIT_FAILURE, 0, "regcomp failure: %d", rc);
+ // default parameters for fdcache are computed from system stats
+ struct statfs sfs;
+ rc = statfs(tmpdir.c_str(), &sfs);
+ if (rc < 0)
+ fdcache_mbs = 1024; // 1 gigabyte
+ else
+ fdcache_mbs = sfs.f_bavail * sfs.f_bsize / 1024 / 1024 / 4; // 25% of free space
+ fdcache_prefetch = 64; // guesstimate storage is this much less costly than re-decompression
+ fdcache_fds = (concurrency + fdcache_prefetch) * 2;
+
/* Parse and process arguments. */
int remaining;
argp_program_version_hook = print_version; // this works
@@ -2490,8 +3095,10 @@ main (int argc, char *argv[])
error (EXIT_FAILURE, 0,
"unexpected argument: %s", argv[remaining]);
- if (!scan_rpms && !scan_files && source_paths.size()>0)
- obatched(clog) << "warning: without -F and/or -R, ignoring PATHs" << endl;
+ if (scan_archives.size()==0 && !scan_files && source_paths.size()>0)
+ obatched(clog) << "warning: without -F -R -U -Z, ignoring PATHs" << endl;
+
+ fdcache.limit(fdcache_fds, fdcache_mbs);
(void) signal (SIGPIPE, SIG_IGN); // microhttpd can generate it incidentally, ignore
(void) signal (SIGINT, signal_handler); // ^C
@@ -2500,9 +3107,6 @@ main (int argc, char *argv[])
(void) signal (SIGUSR1, sigusr1_handler); // end-user
(void) signal (SIGUSR2, sigusr2_handler); // end-user
- // do this before any threads start
- scan_concurrency_sem = new semaphore(concurrency);
-
/* Get database ready. */
rc = sqlite3_open_v2 (db_path.c_str(), &db, (SQLITE_OPEN_READWRITE
|SQLITE_OPEN_CREATE
@@ -2611,64 +3215,68 @@ main (int argc, char *argv[])
if (maxigroom)
obatched(clog) << "maxigroomed database" << endl;
-
obatched(clog) << "search concurrency " << concurrency << endl;
obatched(clog) << "rescan time " << rescan_s << endl;
+ obatched(clog) << "fdcache fds " << fdcache_fds << endl;
+ obatched(clog) << "fdcache mbs " << fdcache_mbs << endl;
+ obatched(clog) << "fdcache prefetch " << fdcache_prefetch << endl;
+ obatched(clog) << "fdcache tmpdir " << tmpdir << endl;
obatched(clog) << "groom time " << groom_s << endl;
+ if (scan_archives.size()>0)
+ {
+ obatched ob(clog);
+ auto& o = ob << "scanning archive types ";
+ for (auto&& arch : scan_archives)
+ o << arch.first << "(" << arch.second << ") ";
+ o << endl;
+ }
const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
if (du && du[0] != '\0') // set to non-empty string?
obatched(clog) << "upstream debuginfod servers: " << du << endl;
- vector<pthread_t> source_file_scanner_threads;
- vector<pthread_t> source_rpm_scanner_threads;
- pthread_t groom_thread;
+ vector<pthread_t> all_threads;
- rc = pthread_create (& groom_thread, NULL, thread_main_groom, NULL);
+ pthread_t pt;
+ rc = pthread_create (& pt, NULL, thread_main_groom, NULL);
if (rc < 0)
error (0, 0, "warning: cannot spawn thread (%d) to groom database\n", rc);
-
- if (scan_files) for (auto&& it : source_paths)
- {
- pthread_t pt;
- rc = pthread_create (& pt, NULL, thread_main_scan_source_file_path, (void*) it.c_str());
- if (rc < 0)
- error (0, 0, "warning: cannot spawn thread (%d) to scan source files %s\n", rc, it.c_str());
- else
- source_file_scanner_threads.push_back(pt);
- }
+ else
+ all_threads.push_back(pt);
- if (scan_rpms) for (auto&& it : source_paths)
+ if (scan_files || scan_archives.size() > 0)
{
- pthread_t pt;
- rc = pthread_create (& pt, NULL, thread_main_scan_source_rpm_path, (void*) it.c_str());
+ pthread_create (& pt, NULL, thread_main_fts_source_paths, NULL);
if (rc < 0)
- error (0, 0, "warning: cannot spawn thread (%d) to scan source rpms %s\n", rc, it.c_str());
- else
- source_rpm_scanner_threads.push_back(pt);
+ error (0, 0, "warning: cannot spawn thread (%d) to traverse source paths\n", rc);
+ all_threads.push_back(pt);
+ for (unsigned i=0; i<concurrency; i++)
+ {
+ pthread_create (& pt, NULL, thread_main_scanner, NULL);
+ if (rc < 0)
+ error (0, 0, "warning: cannot spawn thread (%d) to scan source files / archives\n", rc);
+ all_threads.push_back(pt);
+ }
}
/* Trivial main loop! */
set_metric("ready", 1);
while (! interrupted)
pause ();
+ scanq.nuke(); // wake up any remaining scanq-related threads, let them die
set_metric("ready", 0);
if (verbose)
obatched(clog) << "stopping" << endl;
- /* Join any source scanning threads. */
- for (auto&& it : source_file_scanner_threads)
- pthread_join (it, NULL);
- for (auto&& it : source_rpm_scanner_threads)
+ /* Join all our threads. */
+ for (auto&& it : all_threads)
pthread_join (it, NULL);
- pthread_join (groom_thread, NULL);
-
+
/* Stop all the web service threads. */
if (d4) MHD_stop_daemon (d4);
if (d6) MHD_stop_daemon (d6);
/* With all threads known dead, we can clean up the global resources. */
- delete scan_concurrency_sem;
rc = sqlite3_exec (db, DEBUGINFOD_SQLITE_CLEANUP_DDL, NULL, NULL, NULL);
if (rc != SQLITE_OK)
{