diff options
Diffstat (limited to 'debuginfod/debuginfod.cxx')
-rw-r--r-- | debuginfod/debuginfod.cxx | 319 |
1 files changed, 203 insertions, 116 deletions
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx index 76f1fa52..3085f388 100644 --- a/debuginfod/debuginfod.cxx +++ b/debuginfod/debuginfod.cxx @@ -92,6 +92,14 @@ using namespace std; #include <libdwelf.h> #include <microhttpd.h> + +#if MHD_VERSION >= 0x00097002 +// libmicrohttpd 0.9.71 broke API +#define MHD_RESULT enum MHD_Result +#else +#define MHD_RESULT int +#endif + #include <curl/curl.h> #include <archive.h> #include <archive_entry.h> @@ -380,7 +388,9 @@ static string db_path; static sqlite3 *db; // single connection, serialized across all our threads! static unsigned verbose; static volatile sig_atomic_t interrupted = 0; +static volatile sig_atomic_t forced_rescan_count = 0; static volatile sig_atomic_t sigusr1 = 0; +static volatile sig_atomic_t forced_groom_count = 0; static volatile sig_atomic_t sigusr2 = 0; static unsigned http_port = 8002; static unsigned rescan_s = 300; @@ -519,12 +529,12 @@ struct reportable_exception void report(ostream& o) const; // defined under obatched() class below - int mhd_send_response(MHD_Connection* c) const { + MHD_RESULT mhd_send_response(MHD_Connection* c) const { MHD_Response* r = MHD_create_response_from_buffer (message.size(), (void*) message.c_str(), MHD_RESPMEM_MUST_COPY); MHD_add_response_header (r, "Content-Type", "text/plain"); - int rc = MHD_queue_response (c, code, r); + MHD_RESULT rc = MHD_queue_response (c, code, r); MHD_destroy_response (r); return rc; } @@ -540,23 +550,31 @@ struct sqlite_exception: public reportable_exception struct libc_exception: public reportable_exception { libc_exception(int rc, const string& msg): - reportable_exception(string("libc error: ") + msg + ": " + string(strerror(rc) ?: "?")) {} + reportable_exception(string("libc error: ") + msg + ": " + string(strerror(rc) ?: "?")) { + inc_metric("error_count","libc",strerror(rc)); + } }; struct archive_exception: public reportable_exception { archive_exception(const string& msg): - reportable_exception(string("libarchive error: ") + msg) {} + reportable_exception(string("libarchive error: ") + msg) { + inc_metric("error_count","libarchive",msg); + } archive_exception(struct archive* a, const string& msg): - reportable_exception(string("libarchive error: ") + msg + ": " + string(archive_error_string(a) ?: "?")) {} + reportable_exception(string("libarchive error: ") + msg + ": " + string(archive_error_string(a) ?: "?")) { + inc_metric("error_count","libarchive",msg); + } }; struct elfutils_exception: public reportable_exception { elfutils_exception(int rc, const string& msg): - reportable_exception(string("elfutils error: ") + msg + ": " + string(elf_errmsg(rc) ?: "?")) {} + reportable_exception(string("elfutils error: ") + msg + ": " + string(elf_errmsg(rc) ?: "?")) { + inc_metric("error_count","elfutils",elf_errmsg(rc)); + } }; @@ -591,6 +609,14 @@ public: cv.notify_all(); } + // clear the workqueue, when scanning is interrupted with USR2 + void clear() { + unique_lock<mutex> lock(mtx); + q.clear(); + set_metric("thread_work_pending","role","scan", q.size()); + cv.notify_all(); // maybe wake up waiting idlers + } + // block this scanner thread until there is work to do and no active bool wait_front (Payload& p) { @@ -882,18 +908,15 @@ add_mhd_last_modified (struct MHD_Response *resp, time_t mtime) static struct MHD_Response* -handle_buildid_f_match (int64_t b_mtime, +handle_buildid_f_match (bool internal_req_t, + int64_t b_mtime, const string& b_source0, int *result_fd) { + (void) internal_req_t; // ignored int fd = open(b_source0.c_str(), O_RDONLY); if (fd < 0) - { - if (verbose) - obatched(clog) << "cannot open " << b_source0 << endl; - // if still missing, a periodic groom pass will delete this buildid record - return 0; - } + throw libc_exception (errno, string("open ") + b_source0); // NB: use manual close(2) in error case instead of defer_dtor, because // in the normal case, we want to hand the fd over to libmicrohttpd for @@ -903,10 +926,8 @@ handle_buildid_f_match (int64_t b_mtime, int rc = fstat(fd, &s); if (rc < 0) { - if (verbose) - clog << "cannot fstat " << b_source0 << endl; close(fd); - return 0; + throw libc_exception (errno, string("fstat ") + b_source0); } if ((int64_t) s.st_mtime != b_mtime) @@ -1075,6 +1096,15 @@ private: long max_mbs; public: + void set_metrics() + { + double total_mb = 0.0; + for (auto i = lru.begin(); i < lru.end(); i++) + total_mb += i->fd_size_mb; + set_metric("fdcache_bytes", (int64_t)(total_mb*1024.0*1024.0)); + set_metric("fdcache_count", lru.size()); + } + void intern(const string& a, const string& b, string fd, off_t sz, bool front_p) { { @@ -1085,19 +1115,27 @@ public: { unlink (i->fd.c_str()); lru.erase(i); + inc_metric("fdcache_op_count","op","dequeue"); break; // must not continue iterating } } double mb = (sz+65535)/1048576.0; // round up to 64K block fdcache_entry n = { a, b, fd, mb }; if (front_p) - lru.push_front(n); + { + inc_metric("fdcache_op_count","op","enqueue_front"); + lru.push_front(n); + } else - lru.push_back(n); - if (verbose > 3) - obatched(clog) << "fdcache interned a=" << a << " b=" << b - << " fd=" << fd << " mb=" << mb << " front=" << front_p << endl; + { + inc_metric("fdcache_op_count","op","enqueue_back"); + lru.push_back(n); + } + if (verbose > 3) + obatched(clog) << "fdcache interned a=" << a << " b=" << b + << " fd=" << fd << " mb=" << mb << " front=" << front_p << endl; } + set_metrics(); // NB: we age the cache at lookup time too if (front_p) @@ -1116,7 +1154,7 @@ public: fdcache_entry n = *i; lru.erase(i); // invalidates i, so no more iteration! lru.push_front(n); - + inc_metric("fdcache_op_count","op","requeue_front"); fd = open(n.fd.c_str(), O_RDONLY); // NB: no problem if dup() fails; looks like cache miss break; } @@ -1135,8 +1173,12 @@ public: for (auto i = lru.begin(); i < lru.end(); i++) { if (i->archive == a && i->entry == b) - return true; + { + inc_metric("fdcache_op_count","op","probe_hit"); + return true; + } } + inc_metric("fdcache_op_count","op","probe_miss"); return false; } @@ -1149,13 +1191,15 @@ public: { // found it; move it to head of lru fdcache_entry n = *i; lru.erase(i); // invalidates i, so no more iteration! + inc_metric("fdcache_op_count","op","clear"); unlink (n.fd.c_str()); + set_metrics(); return; } } } - void limit(long maxfds, long maxmbs) + void limit(long maxfds, long maxmbs, bool metrics_p = true) { if (verbose > 3 && (this->max_fds != maxfds || this->max_mbs != maxmbs)) obatched(clog) << "fdcache limited to maxfds=" << maxfds << " maxmbs=" << maxmbs << endl; @@ -1180,19 +1224,23 @@ public: if (verbose > 3) obatched(clog) << "fdcache evicted a=" << j->archive << " b=" << j->entry << " fd=" << j->fd << " mb=" << j->fd_size_mb << endl; + if (metrics_p) + inc_metric("fdcache_op_count","op","evict"); unlink (j->fd.c_str()); } lru.erase(i, lru.end()); // erase the nodes generally break; } - } + if (metrics_p) set_metrics(); } ~libarchive_fdcache() { - limit(0, 0); + // unlink any fdcache entries in $TMPDIR + // don't update metrics; those globals may be already destroyed + limit(0, 0, false); } }; static libarchive_fdcache fdcache; @@ -1218,7 +1266,8 @@ string canonicalized_archive_entry_pathname(struct archive_entry *e) static struct MHD_Response* -handle_buildid_r_match (int64_t b_mtime, +handle_buildid_r_match (bool internal_req_p, + int64_t b_mtime, const string& b_source0, const string& b_source1, int *result_fd) @@ -1322,7 +1371,8 @@ handle_buildid_r_match (int64_t b_mtime, // 3) extract some number of prefetched entries (just into fdcache) // 4) abort any further processing struct MHD_Response* r = 0; // will set in stage 2 - unsigned prefetch_count = fdcache_prefetch; // will decrement in stage 3 + unsigned prefetch_count = + internal_req_p ? 0 : fdcache_prefetch; // will decrement in stage 3 while(r == 0 || prefetch_count > 0) // stage 1, 2, or 3 { @@ -1355,7 +1405,8 @@ handle_buildid_r_match (int64_t b_mtime, throw libc_exception (errno, "cannot create temporary file"); // NB: don't unlink (tmppath), as fdcache will take charge of it. - rc = archive_read_data_into_fd (a, fd); + // NB: this can take many uninterruptible seconds for a huge file + rc = archive_read_data_into_fd (a, fd); if (rc != ARCHIVE_OK) // e.g. ENOSPC! { close (fd); @@ -1416,18 +1467,28 @@ handle_buildid_r_match (int64_t b_mtime, static struct MHD_Response* -handle_buildid_match (int64_t b_mtime, +handle_buildid_match (bool internal_req_p, + int64_t b_mtime, const string& b_stype, const string& b_source0, const string& b_source1, int *result_fd) { - if (b_stype == "F") - return handle_buildid_f_match(b_mtime, b_source0, result_fd); - else if (b_stype == "R") - return handle_buildid_r_match(b_mtime, b_source0, b_source1, result_fd); - else - return 0; + try + { + if (b_stype == "F") + return handle_buildid_f_match(internal_req_p, b_mtime, b_source0, result_fd); + else if (b_stype == "R") + return handle_buildid_r_match(internal_req_p, b_mtime, b_source0, b_source1, result_fd); + } + catch (const reportable_exception &e) + { + e.report(clog); + // Report but swallow libc etc. errors here; let the caller + // iterate to other matches of the content. + } + + return 0; } @@ -1497,6 +1558,8 @@ handle_buildid (MHD_Connection* conn, "order by sharedprefix(source0,source0ref) desc, mtime desc"); pp->reset(); pp->bind(1, buildid); + // NB: we don't store the non-canonicalized path names any more, but old databases + // might have them (and no canon ones), so we keep searching for both. pp->bind(2, suffix); pp->bind(3, canon_pathname(suffix)); } @@ -1521,7 +1584,8 @@ handle_buildid (MHD_Connection* conn, // Try accessing the located match. // XXX: in case of multiple matches, attempt them in parallel? - auto r = handle_buildid_match (b_mtime, b_stype, b_source0, b_source1, result_fd); + auto r = handle_buildid_match (conn ? false : true, + b_mtime, b_stype, b_source0, b_source1, result_fd); if (r) return r; } @@ -1622,6 +1686,8 @@ handle_buildid (MHD_Connection* conn, static map<string,int64_t> metrics; // arbitrary data for /metrics query // NB: store int64_t since all our metrics are integers; prometheus accepts double static mutex metrics_lock; +// NB: these objects get released during the process exit via global dtors +// do not call them from within other global dtors // utility function for assembling prometheus-compatible // name="escaped-value" strings @@ -1723,7 +1789,7 @@ handle_metrics (off_t* size) /* libmicrohttpd callback */ -static int +static MHD_RESULT handler_cb (void * /*cls*/, struct MHD_Connection *connection, const char *url, @@ -1736,7 +1802,11 @@ handler_cb (void * /*cls*/, struct MHD_Response *r = NULL; string url_copy = url; +#if MHD_VERSION >= 0x00097002 + enum MHD_Result rc; +#else int rc = MHD_NO; // mhd +#endif int http_code = 500; off_t http_size = -1; struct timeval tv_start, tv_end; @@ -2159,7 +2229,9 @@ scan_source_file (const string& rps, const stat_t& st, elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles); else throw libc_exception(errno, string("open ") + rps); - inc_metric ("scanned_total","source","file"); + add_metric ("scanned_bytes_total","source","file", + st.st_size); + inc_metric ("scanned_files_total","source","file"); } // NB: we catch exceptions here too, so that we can // cache the corrupt-elf case (!executable_p && @@ -2242,41 +2314,27 @@ scan_source_file (const string& rps, const stat_t& st, .bind(1, srps) .step_ok_done(); - // register the dwarfsrc name in the interning table too + // PR25548: store canonicalized dwarfsrc path + string dwarfsrc_canon = canon_pathname (dwarfsrc); + if (dwarfsrc_canon != dwarfsrc) + { + if (verbose > 3) + obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl; + } + ps_upsert_files .reset() - .bind(1, dwarfsrc) + .bind(1, dwarfsrc_canon) .step_ok_done(); ps_upsert_s .reset() .bind(1, buildid) - .bind(2, dwarfsrc) + .bind(2, dwarfsrc_canon) .bind(3, srps) .bind(4, sfs.st_mtime) .step_ok_done(); - // PR25548: also store canonicalized source path - string dwarfsrc_canon = canon_pathname (dwarfsrc); - if (dwarfsrc_canon != dwarfsrc) - { - if (verbose > 3) - obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl; - - ps_upsert_files - .reset() - .bind(1, dwarfsrc_canon) - .step_ok_done(); - - ps_upsert_s - .reset() - .bind(1, buildid) - .bind(2, dwarfsrc_canon) - .bind(3, srps) - .bind(4, sfs.st_mtime) - .step_ok_done(); - } - inc_metric("found_sourcerefs_total","source","files"); } } @@ -2356,9 +2414,12 @@ archive_classify (const string& rps, string& archive_extension, if (verbose > 3) obatched(clog) << "libarchive scanning " << rps << endl; - while(1) // parse cpio archive entries + while(1) // parse archive entries { - try + if (interrupted) + break; + + try { struct archive_entry *e; rc = archive_read_next_header (a, &e); @@ -2427,36 +2488,25 @@ archive_classify (const string& rps, string& archive_extension, continue; } - ps_upsert_files - .reset() - .bind(1, s) - .step_ok_done(); - - ps_upsert_sref - .reset() - .bind(1, buildid) - .bind(2, s) - .step_ok_done(); - - // PR25548: also store canonicalized source path + // PR25548: store canonicalized source path const string& dwarfsrc = s; string dwarfsrc_canon = canon_pathname (dwarfsrc); if (dwarfsrc_canon != dwarfsrc) { if (verbose > 3) obatched(clog) << "canonicalized src=" << dwarfsrc << " alias=" << dwarfsrc_canon << endl; + } - ps_upsert_files - .reset() - .bind(1, dwarfsrc_canon) - .step_ok_done(); + ps_upsert_files + .reset() + .bind(1, dwarfsrc_canon) + .step_ok_done(); - ps_upsert_sref - .reset() - .bind(1, buildid) - .bind(2, dwarfsrc_canon) - .step_ok_done(); - } + ps_upsert_sref + .reset() + .bind(1, buildid) + .bind(2, dwarfsrc_canon) + .step_ok_done(); fts_sref ++; } @@ -2558,7 +2608,9 @@ scan_archive_file (const string& rps, const stat_t& st, st.st_mtime, my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef, my_fts_sref_complete_p); - inc_metric ("scanned_total","source",archive_extension + " archive"); + add_metric ("scanned_bytes_total","source",archive_extension + " archive", + st.st_size); + inc_metric ("scanned_files_total","source",archive_extension + " archive"); add_metric("found_debuginfo_total","source",archive_extension + " archive", my_fts_debuginfo); add_metric("found_executable_total","source",archive_extension + " archive", @@ -2667,7 +2719,8 @@ thread_main_scanner (void* arg) add_metric("thread_busy", "role", "scan", -1); bool gotone = scanq.wait_front(p); add_metric("thread_busy", "role", "scan", 1); - if (! gotone) continue; // or break + + if (! gotone) continue; // go back to waiting try { @@ -2706,7 +2759,9 @@ thread_main_scanner (void* arg) e.report(cerr); } - inc_metric("thread_work_total", "role","scan"); + // finished a scanning step -- not a "loop", because we just + // consume the traversal loop's work, whenever + inc_metric("thread_work_total","role","scan"); } add_metric("thread_busy", "role", "scan", -1); @@ -2750,35 +2805,50 @@ scan_source_paths() { if (interrupted) break; + if (sigusr2 != forced_groom_count) // stop early if groom triggered + { + scanq.clear(); // clear previously issued work for scanner threads + break; + } + fts_scanned ++; if (verbose > 2) obatched(clog) << "fts traversing " << f->fts_path << endl; - /* Found a file. Convert it to an absolute path, so - the buildid database does not have relative path - names that are unresolvable from a subsequent run - in a different cwd. */ - char *rp = realpath(f->fts_path, NULL); - if (rp == NULL) - continue; // ignore dangling symlink or such - string rps = string(rp); - free (rp); - - bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0); - bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0); - if (!ri || rx) - { - if (verbose > 3) - obatched(clog) << "fts skipped by regex " << (!ri ? "I" : "") << (rx ? "X" : "") << endl; - fts_regex ++; - continue; - } - switch (f->fts_info) { case FTS_F: - scanq.push_back (make_pair(rps, *f->fts_statp)); + { + /* Found a file. Convert it to an absolute path, so + the buildid database does not have relative path + names that are unresolvable from a subsequent run + in a different cwd. */ + char *rp = realpath(f->fts_path, NULL); + if (rp == NULL) + continue; // ignore dangling symlink or such + string rps = string(rp); + free (rp); + + bool ri = !regexec (&file_include_regex, rps.c_str(), 0, 0, 0); + bool rx = !regexec (&file_exclude_regex, rps.c_str(), 0, 0, 0); + if (!ri || rx) + { + if (verbose > 3) + obatched(clog) << "fts skipped by regex " + << (!ri ? "I" : "") << (rx ? "X" : "") << endl; + fts_regex ++; + if (!ri) + inc_metric("traversed_total","type","file-skipped-I"); + if (rx) + inc_metric("traversed_total","type","file-skipped-X"); + } + else + { + scanq.push_back (make_pair(rps, *f->fts_statp)); + inc_metric("traversed_total","type","file"); + } + } break; case FTS_ERR: @@ -2788,11 +2858,20 @@ scan_source_paths() auto x = libc_exception(f->fts_errno, string("fts traversal ") + string(f->fts_path)); x.report(cerr); } + inc_metric("traversed_total","type","error"); + break; + + case FTS_SL: // ignore, but count because debuginfod -L would traverse these + inc_metric("traversed_total","type","symlink"); break; - default: - ; - /* ignore */ + case FTS_D: // ignore + inc_metric("traversed_total","type","directory"); + break; + + default: // ignore + inc_metric("traversed_total","type","other"); + break; } } gettimeofday (&tv_end, NULL); @@ -2808,7 +2887,6 @@ thread_main_fts_source_paths (void* arg) { (void) arg; // ignore; we operate on global data - sig_atomic_t forced_rescan_count = 0; set_metric("thread_tid", "role","traverse", tid()); add_metric("thread_count", "role", "traverse", 1); @@ -2838,6 +2916,7 @@ thread_main_fts_source_paths (void* arg) set_metric("thread_busy", "role","traverse", 1); scan_source_paths(); last_rescan = time(NULL); // NB: now was before scanning + // finished a traversal loop inc_metric("thread_work_total", "role","traverse"); set_metric("thread_busy", "role","traverse", 0); } @@ -2889,6 +2968,8 @@ void groom() struct timeval tv_start, tv_end; gettimeofday (&tv_start, NULL); + database_stats_report(); + // scan for files that have disappeared sqlite_ps files (db, "check old files", "select s.mtime, s.file, f.name from " BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f " @@ -2916,7 +2997,13 @@ void groom() files_del_f_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done(); files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done(); files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done(); + inc_metric("groomed_total", "decision", "stale"); } + else + inc_metric("groomed_total", "decision", "fresh"); + + if (sigusr1 != forced_rescan_count) // stop early if scan triggered + break; } files.reset(); @@ -2953,7 +3040,6 @@ void groom() static void* thread_main_groom (void* /*arg*/) { - sig_atomic_t forced_groom_count = 0; set_metric("thread_tid", "role", "groom", tid()); add_metric("thread_count", "role", "groom", 1); @@ -2982,6 +3068,7 @@ thread_main_groom (void* /*arg*/) set_metric("thread_busy", "role", "groom", 1); groom (); last_groom = time(NULL); // NB: now was before grooming + // finished a grooming loop inc_metric("thread_work_total", "role", "groom"); set_metric("thread_busy", "role", "groom", 0); } |