PR32218: debuginfod-client: support very long source file names

debuginfod clients & servers may sometimes encounter very long source file names. Previously, the client would synthesize a path name like $CACHEDIR/$BUILDID/source-$PATHNAME where $PATHNAME was a funky ##-encoded version of the entire source path name. See https://sourceware.org/PR32218 for a horror case. This can get too long to store as a single component of a file system pathname (e.g. linux/limits.h NAME_MAX), resulting on client-side errors even after a successful download. New code switches encoding of the $PATHNAME part to use less escaping, and a merciless truncation to the tail part of the filename. (We keep the tail rather than the head, so that the extension is preserved, which makes some consumers happier.) To limit collision damage from truncation, we add also insert a goofy hash (4-byte DJBX33A) of the name into the path name. The result is a relatively short name: $CACHEDIR/$BUILDID/source-$HASH-$NAMETAIL This is a transparent change to clients, who are not to make any assumptions about cache file naming structure. However, one existing test did make such assumptions, so is fixed with some globness. A new test is also added, using a pre-baked tarball with a very long srcfile name. Signed-off-by: Frank Ch. Eigler <[email protected]>
author: Frank Ch. Eigler <[email protected]> 2024-10-10 16:30:19 -0400
committer: Frank Ch. Eigler <[email protected]> 2024-10-16 09:49:35 -0400
commit: 6814e0edc112583428114bc20f8d78f864c57128 (patch)
tree: 5201ec0283a1994c445157c6fc3091c588df381e /debuginfod
parent: b68f34725229b08380a1612899b0537f8f597dad (diff)
1 files changed, 70 insertions, 38 deletions
diff --git a/debuginfod/debuginfod-client.c b/debuginfod/debuginfod-client.c
index 24ede19a..deff19ff 100644
--- a/debuginfod/debuginfod-client.c
+++ b/debuginfod/debuginfod-client.c
@@ -1201,29 +1201,69 @@ perform_queries(CURLM *curlm, CURL **target_handle, struct handle_data *data, de
 /* Copy SRC to DEST, s,/,#,g */
 
 static void
-path_escape (const char *src, char *dest)
+path_escape (const char *src, char *dest, size_t dest_len)
 {
-  unsigned q = 0;
+  /* PR32218: Reversibly-escaping src character-by-character, for
+     large enough strings, risks ENAMETOOLONG errors.  For long names,
+     a simple hash based generated name instead, while still
+     attempting to preserve the as much of the content as possible.
+     It's possible that absurd choices of incoming names will collide
+     or still get truncated, but c'est la vie.
+  */
+  
+  /* Compute a three-way min() for the actual output string generated. */
+  assert (dest_len > 10); /* Space enough for degenerate case of
+                             "HASHHASH-\0". NB: dest_len is not
+                             user-controlled. */
+  /* Use only NAME_MAX/2 characters in the output file name.
+     ENAMETOOLONG has been observed even on 300-ish character names on
+     some filesystems. */
+  const size_t max_dest_len = NAME_MAX/2;
+  dest_len = dest_len > max_dest_len ? max_dest_len : dest_len;
+  /* Use only strlen(src)+10 bytes, if that's smaller.  Yes, we could
+     just fit an entire escaped name in there in theory, without the
+     hash+etc.  But then again the hashing protects against #-escape
+     aliasing collisions: "foo[bar" "foo]bar" both escape to
+     "foo#bar", thus aliasing, but have different "HASH-foo#bar".
+  */
+  const size_t hash_prefix_destlen = strlen(src)+10; /* DEADBEEF-src\0 */
+  dest_len = dest_len > hash_prefix_destlen ? hash_prefix_destlen : dest_len;
 
-  for (unsigned fi=0; q < PATH_MAX-2; fi++) /* -2, escape is 2 chars.  */
-    switch (src[fi])
-      {
-      case '\0':
-        dest[q] = '\0';
-	return;
-      case '/': /* escape / to prevent dir escape */
-        dest[q++]='#';
-        dest[q++]='#';
-        break;
-      case '#': /* escape # to prevent /# vs #/ collisions */
-        dest[q++]='#';
-        dest[q++]='_';
-        break;
-      default:
-        dest[q++]=src[fi];
-      }
+  char *dest_write = dest + dest_len - 1;
+  (*dest_write--) = '\0'; /* Ensure a \0 there. */
 
-  dest[q] = '\0';
+  /* Copy from back toward front, preferring to keep the .extension. */
+  for (int fi=strlen(src)-1; fi >= 0 && dest_write >= dest; fi--)
+    {
+      char src_char = src[fi];
+      switch (src_char)
+        {
+          /* Pass through ordinary identifier chars. */
+        case '.': case '-': case '_':
+        case 'a'...'z':
+        case 'A'...'Z':
+        case '0'...'9':
+          *dest_write-- = src_char;
+          break;
+          
+          /* Replace everything else, esp. security-sensitive /. */
+        default:
+          *dest_write-- = '#';
+          break;
+        }
+    }
+
+  /* djb2 hash algorithm: DJBX33A */
+  unsigned long hash = 5381;
+  const char *c = src;
+  while (*c)
+    hash = ((hash << 5) + hash) + *c++;
+  char name_hash_str [9];
+  /* Truncate to 4 bytes; along with the remaining hundredish bytes of text,
+     should be ample against accidental collisions. */
+  snprintf (name_hash_str, sizeof(name_hash_str), "%08x", (unsigned int) hash);
+  memcpy (&dest[0], name_hash_str, 8); /* Overwrite the first few characters */
+  dest[8] = '-'; /* Add a bit of punctuation to make hash stand out */
 }
 
 /* Attempt to update the atime */
@@ -1267,7 +1307,6 @@ extract_section (int fd, const char *section, char *fd_path, char **usr_path)
     }
 
   int sec_fd = -1;
-  char *escaped_name = NULL;
   char *sec_path_tmp = NULL;
   Elf_Scn *scn = NULL;
 
@@ -1332,16 +1371,10 @@ extract_section (int fd, const char *section, char *fd_path, char **usr_path)
 	      --i;
 	    }
 
-	  escaped_name = malloc (strlen (section) * 2 + 1);
-	  if (escaped_name == NULL)
-	    {
-	      rc = -ENOMEM;
-	      goto out;
-	    }
-	  path_escape (section, escaped_name);
-
+          char suffix[NAME_MAX];
+	  path_escape (section, suffix, sizeof(suffix));
 	  rc = asprintf (&sec_path_tmp, "%s/section-%s.XXXXXX",
-			 fd_path, escaped_name);
+			 fd_path, suffix);
 	  if (rc == -1)
 	    {
 	      rc = -ENOMEM;
@@ -1396,8 +1429,6 @@ out2:
   free (sec_path_tmp);
 
 out1:
-  free (escaped_name);
-
 out:
   elf_end (elf);
   return rc;
@@ -1747,7 +1778,7 @@ debuginfod_query_server_by_buildid (debuginfod_client *c,
   char *target_cache_dir = NULL;
   char *target_cache_path = NULL;
   char *target_cache_tmppath = NULL;
-  char suffix[PATH_MAX + 1]; /* +1 for zero terminator.  */
+  char suffix[NAME_MAX];
   char build_id_bytes[MAX_BUILD_ID_BYTES * 2 + 1];
   int vfd = c->verbose_fd;
   int rc, r;
@@ -1861,13 +1892,13 @@ debuginfod_query_server_by_buildid (debuginfod_client *c,
 	  goto out;
 	}
 
-      path_escape (filename, suffix);
+      path_escape (filename, suffix, sizeof(suffix));
       /* If the DWARF filenames are super long, this could exceed
          PATH_MAX and truncate/collide.  Oh well, that'll teach
          them! */
     }
   else if (section != NULL)
-    path_escape (section, suffix);
+    path_escape (section, suffix, sizeof(suffix));
   else
     suffix[0] = '\0';
 
@@ -1879,7 +1910,8 @@ debuginfod_query_server_by_buildid (debuginfod_client *c,
      cache_path:        $HOME/.cache
      target_cache_dir:  $HOME/.cache/0123abcd
      target_cache_path: $HOME/.cache/0123abcd/debuginfo
-     target_cache_path: $HOME/.cache/0123abcd/source#PATH#TO#SOURCE ?
+     target_cache_path: $HOME/.cache/0123abcd/executable-.debug_info
+     target_cache_path: $HOME/.cache/0123abcd/source-HASH-#PATH#TO#SOURCE
   */
 
   cache_path = make_cache_path();
@@ -1891,10 +1923,10 @@ debuginfod_query_server_by_buildid (debuginfod_client *c,
   xalloc_str (target_cache_dir, "%s/%s", cache_path, build_id_bytes);
   (void) mkdir (target_cache_dir, 0700); // failures with this mkdir would be caught later too
 
-  if (section != NULL)
+  if (suffix[0] != '\0') /* section, source queries */
     xalloc_str (target_cache_path, "%s/%s-%s", target_cache_dir, type, suffix);
   else
-    xalloc_str (target_cache_path, "%s/%s%s", target_cache_dir, type, suffix);
+    xalloc_str (target_cache_path, "%s/%s", target_cache_dir, type);
   xalloc_str (target_cache_tmppath, "%s.XXXXXX", target_cache_path);
 
   /* XXX combine these */
author	Frank Ch. Eigler <[email protected]>	2024-10-10 16:30:19 -0400
committer	Frank Ch. Eigler <[email protected]>	2024-10-16 09:49:35 -0400
commit	6814e0edc112583428114bc20f8d78f864c57128 (patch)
tree	5201ec0283a1994c445157c6fc3091c588df381e /debuginfod
parent	b68f34725229b08380a1612899b0537f8f597dad (diff)