Browse code

add functions readGMT() and deduplicateGeneSets()

Axel Klenk authored on 25/04/2024 17:15:53
Showing 6 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: GSVA
2
-Version: 1.51.14
2
+Version: 1.51.15
3 3
 Title: Gene Set Variation Analysis for Microarray and RNA-Seq Data
4 4
 Authors@R: c(person("Robert", "Castelo", role=c("aut", "cre"), email="[email protected]"),
5 5
              person("Justin", "Guinney", role="aut", email="[email protected]"),
... ...
@@ -69,9 +69,11 @@ importFrom(stats,rnorm)
69 69
 importFrom(stats,rpois)
70 70
 importFrom(stats,sd)
71 71
 importFrom(utils,capture.output)
72
+importFrom(utils,head)
72 73
 importFrom(utils,installed.packages)
73 74
 importFrom(utils,read.csv)
74 75
 importFrom(utils,setTxtProgressBar)
76
+importFrom(utils,tail)
75 77
 importFrom(utils,txtProgressBar)
76 78
 importFrom(utils,write.csv)
77 79
 importMethodsFrom(Biobase,annotation)
... ...
@@ -14,7 +14,7 @@
14 14
 #'
15 15
 #' @importFrom graphics plot
16 16
 #' @importFrom stats ecdf na.omit rnorm rpois sd
17
-#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar
17
+#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar head tail
18 18
 #' read.csv write.csv capture.output
19 19
 #' @importFrom Matrix nnzero
20 20
 #' @importFrom S4Vectors SimpleList DataFrame
... ...
@@ -398,6 +398,131 @@ setMethod("geneSetSizes", signature("GsvaExprData"),
398 398
           })
399 399
 
400 400
 
401
+### ----- helper functions for gene set I/O and preprocessing -----
402
+
403
+#' @title Handling of Duplicated Gene Set Names
404
+#' 
405
+#' @description Offers a choice of ways for handling duplicated gene set names
406
+#' that may not be suitable as input to other gene set analysis functions.
407
+#' 
408
+#' @param geneSets A named list of gene sets represented as character vectors
409
+#' of gene IDs as e.g. returned by [`readGMT`].
410
+#'
411
+#' @param deduplUse A character vector of length 1 specifying one of several
412
+#' methods to handle duplicated gene set names.
413
+#' Duplicated gene set names are explicitly forbidden by the
414
+#' [GMT file format specification](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats)
415
+#' but can nevertheless be encountered in the wild.
416
+#' The available choices are:
417
+#' * `first` (the default): drops all gene sets whose names are [`duplicated`]
418
+#' according to the base R function and retains only the first occurence of a
419
+#' gene set name.
420
+#' * `drop`:  removes *all* gene sets that have a duplicated name, including its
421
+#' first occurrence.
422
+#' * `union`: replaces gene sets with duplicated names by a single gene set
423
+#' containing the union of all their gene IDs.
424
+#' * `smallest`: drops gene sets with duplicated names and retains only the
425
+#' smallest of them, i.e. the one with the fewest gene IDs.  If there are
426
+#' several smallest gene sets, the first will be selected.
427
+#' * `largest`: drops gene sets with duplicated names and retains only the
428
+#' largest of them, i.e. the one with the most gene IDs.  If there are
429
+#' several largest gene sets, the first will be selected.
430
+#'
431
+#' @return A named list of gene sets that represented as character vectors of
432
+#' gene IDs.
433
+#' 
434
+#' @aliases deduplicateGeneSets
435
+#' @name deduplicateGeneSets
436
+#' @rdname deduplicateGeneSets
437
+#' 
438
+deduplicateGeneSets <- function(geneSets,
439
+                                deduplUse = c("first", "drop", "union",
440
+                                              "smallest", "largest")) {
441
+    ddUse <- match.arg(deduplUse)
442
+    isNameDuplicated <- duplicated(names(geneSets))
443
+    duplicatedNames <- unique(names(geneSets[isNameDuplicated]))
444
+
445
+    ## a nested list containing sublists of duplicated gene sets
446
+    duplicatedGeneSets <- sapply(duplicatedNames,
447
+                                 function(dn, gs) unname(gs[dn == names(gs)]),
448
+                                 gs = geneSets, simplify=FALSE)
449
+
450
+    ## transformation function operating on sublists of such nested lists,
451
+    ## returning a single deduplicated gene set, i.e. character vector
452
+    ddFunc <- switch(ddUse,
453
+                     union=function(dgs) Reduce(union, dgs),
454
+                     smallest=function(dgs) dgs[which.min(lengths(dgs))],
455
+                     largest=function(dgs) dgs[which.max(lengths(dgs))])
456
+
457
+    ## apply transformation function to deduplicate gene sets (if requested)
458
+    if(!is.null(ddFunc))
459
+        dedupl <- sapply(duplicatedGeneSets, FUN=ddFunc, simplify=FALSE)
460
+
461
+    ## drop all duplicate gene sets (sufficient for default of "first")
462
+    geneSets[isNameDuplicated] <- NULL
463
+
464
+    ## remove or replace non-duplicated with deduplicated gene sets
465
+    if(ddUse == "drop") {
466
+        geneSets[duplicatedNames] <- NULL
467
+    } else if(!is.null(ddFunc)) {
468
+        geneSets[duplicatedNames] <- dedupl
469
+    }
470
+
471
+    return(geneSets)
472
+}
473
+
474
+
475
+#' @title Import Gene Sets from a GMT File
476
+#' 
477
+#' @description Imports a list of gene sets from a GMT (Gene Matrix Transposed)
478
+#' format file, offering a choice of ways to handle duplicated gene set names.
479
+#' 
480
+#' @param con A connection object or character string containing e.g.
481
+#' a file name or URL.  This is directly passed to [`readLines`] and hence may
482
+#' contain anything that `readLines()` can handle.
483
+#'
484
+#' @param deduplUse With the exception of the special method `custom`, all
485
+#' handling of duplicated gene set names is delegated to function
486
+#' [`deduplicateGeneSets`] and this argument is directly passed on.
487
+#' Please see `?deduplicatedGeneSets`.
488
+#' Using `deduplUse=custom` allows import of the GMT file for manual inspection
489
+#' and its content and remedy is the user's responsibility.  However, `gsva()`
490
+#' will *not* accept the result for further use unless it is modified to have
491
+#' duplicated gene set names removed.
492
+#'
493
+#' @return A named list of gene sets that represented as character vectors of
494
+#' gene IDs.
495
+#' 
496
+#' @seealso [`readLines`], [`deduplicateGeneSets`]
497
+#'
498
+#' @aliases readGMT
499
+#' @name readGMT
500
+#' @rdname readGMT
501
+#' 
502
+readGMT <- function(con,
503
+                    deduplUse = c("first", "drop", "union",
504
+                                  "smallest", "largest", "custom")) {
505
+    ddUse <- match.arg(deduplUse)
506
+    gmtLines <- strsplit(readLines(con=con), split="\t", fixed=TRUE)
507
+    gmt <- lapply(gmtLines, tail, -2)
508
+    names(gmt) <- sapply(gmtLines, head, 1)
509
+
510
+    if(anyDuplicated(names(gmt)) > 0) {
511
+        warning("GMT contains duplicated gene set names; deduplicated",
512
+                " using method: ", ddUse)
513
+        
514
+        if(ddUse != "custom") {
515
+            gmt <- deduplicateGeneSets(geneSets=gmt, deduplUse=ddUse)
516
+        } else {
517
+            warning("Method 'custom' requires YOU to remedy duplicate ",
518
+                    "gene set names as gsva() will not accept them")
519
+        }
520
+    }
521
+    
522
+    return(gmt)
523
+}
524
+
525
+
401 526
 ### ----- methods for data pre-/post-processing -----
402 527
 
403 528
 ## unwrapData: extract a data matrix from a container object
404 529
new file mode 100644
... ...
@@ -0,0 +1,45 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/gsvaNewAPI.R
3
+\name{deduplicateGeneSets}
4
+\alias{deduplicateGeneSets}
5
+\title{Handling of Duplicated Gene Set Names}
6
+\usage{
7
+deduplicateGeneSets(
8
+  geneSets,
9
+  deduplUse = c("first", "drop", "union", "smallest", "largest")
10
+)
11
+}
12
+\arguments{
13
+\item{geneSets}{A named list of gene sets represented as character vectors
14
+of gene IDs as e.g. returned by \code{\link{readGMT}}.}
15
+
16
+\item{deduplUse}{A character vector of length 1 specifying one of several
17
+methods to handle duplicated gene set names.
18
+Duplicated gene set names are explicitly forbidden by the
19
+\href{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}{GMT file format specification}
20
+but can nevertheless be encountered in the wild.
21
+The available choices are:
22
+\itemize{
23
+\item \code{first} (the default): drops all gene sets whose names are \code{\link{duplicated}}
24
+according to the base R function and retains only the first occurence of a
25
+gene set name.
26
+\item \code{drop}:  removes \emph{all} gene sets that have a duplicated name, including its
27
+first occurrence.
28
+\item \code{union}: replaces gene sets with duplicated names by a single gene set
29
+containing the union of all their gene IDs.
30
+\item \code{smallest}: drops gene sets with duplicated names and retains only the
31
+smallest of them, i.e. the one with the fewest gene IDs.  If there are
32
+several smallest gene sets, the first will be selected.
33
+\item \code{largest}: drops gene sets with duplicated names and retains only the
34
+largest of them, i.e. the one with the most gene IDs.  If there are
35
+several largest gene sets, the first will be selected.
36
+}}
37
+}
38
+\value{
39
+A named list of gene sets that represented as character vectors of
40
+gene IDs.
41
+}
42
+\description{
43
+Offers a choice of ways for handling duplicated gene set names
44
+that may not be suitable as input to other gene set analysis functions.
45
+}
0 46
new file mode 100644
... ...
@@ -0,0 +1,36 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/gsvaNewAPI.R
3
+\name{readGMT}
4
+\alias{readGMT}
5
+\title{Import Gene Sets from a GMT File}
6
+\usage{
7
+readGMT(
8
+  con,
9
+  deduplUse = c("first", "drop", "union", "smallest", "largest", "custom")
10
+)
11
+}
12
+\arguments{
13
+\item{con}{A connection object or character string containing e.g.
14
+a file name or URL.  This is directly passed to \code{\link{readLines}} and hence may
15
+contain anything that \code{readLines()} can handle.}
16
+
17
+\item{deduplUse}{With the exception of the special method \code{custom}, all
18
+handling of duplicated gene set names is delegated to function
19
+\code{\link{deduplicateGeneSets}} and this argument is directly passed on.
20
+Please see \code{?deduplicatedGeneSets}.
21
+Using \code{deduplUse=custom} allows import of the GMT file for manual inspection
22
+and its content and remedy is the user's responsibility.  However, \code{gsva()}
23
+will \emph{not} accept the result for further use unless it is modified to have
24
+duplicated gene set names removed.}
25
+}
26
+\value{
27
+A named list of gene sets that represented as character vectors of
28
+gene IDs.
29
+}
30
+\description{
31
+Imports a list of gene sets from a GMT (Gene Matrix Transposed)
32
+format file, offering a choice of ways to handle duplicated gene set names.
33
+}
34
+\seealso{
35
+\code{\link{readLines}}, \code{\link{deduplicateGeneSets}}
36
+}