... | ... |
@@ -1,5 +1,5 @@ |
1 | 1 |
Package: GSVA |
2 |
-Version: 1.51.14 |
|
2 |
+Version: 1.51.15 |
|
3 | 3 |
Title: Gene Set Variation Analysis for Microarray and RNA-Seq Data |
4 | 4 |
Authors@R: c(person("Robert", "Castelo", role=c("aut", "cre"), email="[email protected]"), |
5 | 5 |
person("Justin", "Guinney", role="aut", email="[email protected]"), |
... | ... |
@@ -69,9 +69,11 @@ importFrom(stats,rnorm) |
69 | 69 |
importFrom(stats,rpois) |
70 | 70 |
importFrom(stats,sd) |
71 | 71 |
importFrom(utils,capture.output) |
72 |
+importFrom(utils,head) |
|
72 | 73 |
importFrom(utils,installed.packages) |
73 | 74 |
importFrom(utils,read.csv) |
74 | 75 |
importFrom(utils,setTxtProgressBar) |
76 |
+importFrom(utils,tail) |
|
75 | 77 |
importFrom(utils,txtProgressBar) |
76 | 78 |
importFrom(utils,write.csv) |
77 | 79 |
importMethodsFrom(Biobase,annotation) |
... | ... |
@@ -14,7 +14,7 @@ |
14 | 14 |
#' |
15 | 15 |
#' @importFrom graphics plot |
16 | 16 |
#' @importFrom stats ecdf na.omit rnorm rpois sd |
17 |
-#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar |
|
17 |
+#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar head tail |
|
18 | 18 |
#' read.csv write.csv capture.output |
19 | 19 |
#' @importFrom Matrix nnzero |
20 | 20 |
#' @importFrom S4Vectors SimpleList DataFrame |
... | ... |
@@ -398,6 +398,131 @@ setMethod("geneSetSizes", signature("GsvaExprData"), |
398 | 398 |
}) |
399 | 399 |
|
400 | 400 |
|
401 |
+### ----- helper functions for gene set I/O and preprocessing ----- |
|
402 |
+ |
|
403 |
+#' @title Handling of Duplicated Gene Set Names |
|
404 |
+#' |
|
405 |
+#' @description Offers a choice of ways for handling duplicated gene set names |
|
406 |
+#' that may not be suitable as input to other gene set analysis functions. |
|
407 |
+#' |
|
408 |
+#' @param geneSets A named list of gene sets represented as character vectors |
|
409 |
+#' of gene IDs as e.g. returned by [`readGMT`]. |
|
410 |
+#' |
|
411 |
+#' @param deduplUse A character vector of length 1 specifying one of several |
|
412 |
+#' methods to handle duplicated gene set names. |
|
413 |
+#' Duplicated gene set names are explicitly forbidden by the |
|
414 |
+#' [GMT file format specification](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats) |
|
415 |
+#' but can nevertheless be encountered in the wild. |
|
416 |
+#' The available choices are: |
|
417 |
+#' * `first` (the default): drops all gene sets whose names are [`duplicated`] |
|
418 |
+#' according to the base R function and retains only the first occurence of a |
|
419 |
+#' gene set name. |
|
420 |
+#' * `drop`: removes *all* gene sets that have a duplicated name, including its |
|
421 |
+#' first occurrence. |
|
422 |
+#' * `union`: replaces gene sets with duplicated names by a single gene set |
|
423 |
+#' containing the union of all their gene IDs. |
|
424 |
+#' * `smallest`: drops gene sets with duplicated names and retains only the |
|
425 |
+#' smallest of them, i.e. the one with the fewest gene IDs. If there are |
|
426 |
+#' several smallest gene sets, the first will be selected. |
|
427 |
+#' * `largest`: drops gene sets with duplicated names and retains only the |
|
428 |
+#' largest of them, i.e. the one with the most gene IDs. If there are |
|
429 |
+#' several largest gene sets, the first will be selected. |
|
430 |
+#' |
|
431 |
+#' @return A named list of gene sets that represented as character vectors of |
|
432 |
+#' gene IDs. |
|
433 |
+#' |
|
434 |
+#' @aliases deduplicateGeneSets |
|
435 |
+#' @name deduplicateGeneSets |
|
436 |
+#' @rdname deduplicateGeneSets |
|
437 |
+#' |
|
438 |
+deduplicateGeneSets <- function(geneSets, |
|
439 |
+ deduplUse = c("first", "drop", "union", |
|
440 |
+ "smallest", "largest")) { |
|
441 |
+ ddUse <- match.arg(deduplUse) |
|
442 |
+ isNameDuplicated <- duplicated(names(geneSets)) |
|
443 |
+ duplicatedNames <- unique(names(geneSets[isNameDuplicated])) |
|
444 |
+ |
|
445 |
+ ## a nested list containing sublists of duplicated gene sets |
|
446 |
+ duplicatedGeneSets <- sapply(duplicatedNames, |
|
447 |
+ function(dn, gs) unname(gs[dn == names(gs)]), |
|
448 |
+ gs = geneSets, simplify=FALSE) |
|
449 |
+ |
|
450 |
+ ## transformation function operating on sublists of such nested lists, |
|
451 |
+ ## returning a single deduplicated gene set, i.e. character vector |
|
452 |
+ ddFunc <- switch(ddUse, |
|
453 |
+ union=function(dgs) Reduce(union, dgs), |
|
454 |
+ smallest=function(dgs) dgs[which.min(lengths(dgs))], |
|
455 |
+ largest=function(dgs) dgs[which.max(lengths(dgs))]) |
|
456 |
+ |
|
457 |
+ ## apply transformation function to deduplicate gene sets (if requested) |
|
458 |
+ if(!is.null(ddFunc)) |
|
459 |
+ dedupl <- sapply(duplicatedGeneSets, FUN=ddFunc, simplify=FALSE) |
|
460 |
+ |
|
461 |
+ ## drop all duplicate gene sets (sufficient for default of "first") |
|
462 |
+ geneSets[isNameDuplicated] <- NULL |
|
463 |
+ |
|
464 |
+ ## remove or replace non-duplicated with deduplicated gene sets |
|
465 |
+ if(ddUse == "drop") { |
|
466 |
+ geneSets[duplicatedNames] <- NULL |
|
467 |
+ } else if(!is.null(ddFunc)) { |
|
468 |
+ geneSets[duplicatedNames] <- dedupl |
|
469 |
+ } |
|
470 |
+ |
|
471 |
+ return(geneSets) |
|
472 |
+} |
|
473 |
+ |
|
474 |
+ |
|
475 |
+#' @title Import Gene Sets from a GMT File |
|
476 |
+#' |
|
477 |
+#' @description Imports a list of gene sets from a GMT (Gene Matrix Transposed) |
|
478 |
+#' format file, offering a choice of ways to handle duplicated gene set names. |
|
479 |
+#' |
|
480 |
+#' @param con A connection object or character string containing e.g. |
|
481 |
+#' a file name or URL. This is directly passed to [`readLines`] and hence may |
|
482 |
+#' contain anything that `readLines()` can handle. |
|
483 |
+#' |
|
484 |
+#' @param deduplUse With the exception of the special method `custom`, all |
|
485 |
+#' handling of duplicated gene set names is delegated to function |
|
486 |
+#' [`deduplicateGeneSets`] and this argument is directly passed on. |
|
487 |
+#' Please see `?deduplicatedGeneSets`. |
|
488 |
+#' Using `deduplUse=custom` allows import of the GMT file for manual inspection |
|
489 |
+#' and its content and remedy is the user's responsibility. However, `gsva()` |
|
490 |
+#' will *not* accept the result for further use unless it is modified to have |
|
491 |
+#' duplicated gene set names removed. |
|
492 |
+#' |
|
493 |
+#' @return A named list of gene sets that represented as character vectors of |
|
494 |
+#' gene IDs. |
|
495 |
+#' |
|
496 |
+#' @seealso [`readLines`], [`deduplicateGeneSets`] |
|
497 |
+#' |
|
498 |
+#' @aliases readGMT |
|
499 |
+#' @name readGMT |
|
500 |
+#' @rdname readGMT |
|
501 |
+#' |
|
502 |
+readGMT <- function(con, |
|
503 |
+ deduplUse = c("first", "drop", "union", |
|
504 |
+ "smallest", "largest", "custom")) { |
|
505 |
+ ddUse <- match.arg(deduplUse) |
|
506 |
+ gmtLines <- strsplit(readLines(con=con), split="\t", fixed=TRUE) |
|
507 |
+ gmt <- lapply(gmtLines, tail, -2) |
|
508 |
+ names(gmt) <- sapply(gmtLines, head, 1) |
|
509 |
+ |
|
510 |
+ if(anyDuplicated(names(gmt)) > 0) { |
|
511 |
+ warning("GMT contains duplicated gene set names; deduplicated", |
|
512 |
+ " using method: ", ddUse) |
|
513 |
+ |
|
514 |
+ if(ddUse != "custom") { |
|
515 |
+ gmt <- deduplicateGeneSets(geneSets=gmt, deduplUse=ddUse) |
|
516 |
+ } else { |
|
517 |
+ warning("Method 'custom' requires YOU to remedy duplicate ", |
|
518 |
+ "gene set names as gsva() will not accept them") |
|
519 |
+ } |
|
520 |
+ } |
|
521 |
+ |
|
522 |
+ return(gmt) |
|
523 |
+} |
|
524 |
+ |
|
525 |
+ |
|
401 | 526 |
### ----- methods for data pre-/post-processing ----- |
402 | 527 |
|
403 | 528 |
## unwrapData: extract a data matrix from a container object |
404 | 529 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,45 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/gsvaNewAPI.R |
|
3 |
+\name{deduplicateGeneSets} |
|
4 |
+\alias{deduplicateGeneSets} |
|
5 |
+\title{Handling of Duplicated Gene Set Names} |
|
6 |
+\usage{ |
|
7 |
+deduplicateGeneSets( |
|
8 |
+ geneSets, |
|
9 |
+ deduplUse = c("first", "drop", "union", "smallest", "largest") |
|
10 |
+) |
|
11 |
+} |
|
12 |
+\arguments{ |
|
13 |
+\item{geneSets}{A named list of gene sets represented as character vectors |
|
14 |
+of gene IDs as e.g. returned by \code{\link{readGMT}}.} |
|
15 |
+ |
|
16 |
+\item{deduplUse}{A character vector of length 1 specifying one of several |
|
17 |
+methods to handle duplicated gene set names. |
|
18 |
+Duplicated gene set names are explicitly forbidden by the |
|
19 |
+\href{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}{GMT file format specification} |
|
20 |
+but can nevertheless be encountered in the wild. |
|
21 |
+The available choices are: |
|
22 |
+\itemize{ |
|
23 |
+\item \code{first} (the default): drops all gene sets whose names are \code{\link{duplicated}} |
|
24 |
+according to the base R function and retains only the first occurence of a |
|
25 |
+gene set name. |
|
26 |
+\item \code{drop}: removes \emph{all} gene sets that have a duplicated name, including its |
|
27 |
+first occurrence. |
|
28 |
+\item \code{union}: replaces gene sets with duplicated names by a single gene set |
|
29 |
+containing the union of all their gene IDs. |
|
30 |
+\item \code{smallest}: drops gene sets with duplicated names and retains only the |
|
31 |
+smallest of them, i.e. the one with the fewest gene IDs. If there are |
|
32 |
+several smallest gene sets, the first will be selected. |
|
33 |
+\item \code{largest}: drops gene sets with duplicated names and retains only the |
|
34 |
+largest of them, i.e. the one with the most gene IDs. If there are |
|
35 |
+several largest gene sets, the first will be selected. |
|
36 |
+}} |
|
37 |
+} |
|
38 |
+\value{ |
|
39 |
+A named list of gene sets that represented as character vectors of |
|
40 |
+gene IDs. |
|
41 |
+} |
|
42 |
+\description{ |
|
43 |
+Offers a choice of ways for handling duplicated gene set names |
|
44 |
+that may not be suitable as input to other gene set analysis functions. |
|
45 |
+} |
0 | 46 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,36 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/gsvaNewAPI.R |
|
3 |
+\name{readGMT} |
|
4 |
+\alias{readGMT} |
|
5 |
+\title{Import Gene Sets from a GMT File} |
|
6 |
+\usage{ |
|
7 |
+readGMT( |
|
8 |
+ con, |
|
9 |
+ deduplUse = c("first", "drop", "union", "smallest", "largest", "custom") |
|
10 |
+) |
|
11 |
+} |
|
12 |
+\arguments{ |
|
13 |
+\item{con}{A connection object or character string containing e.g. |
|
14 |
+a file name or URL. This is directly passed to \code{\link{readLines}} and hence may |
|
15 |
+contain anything that \code{readLines()} can handle.} |
|
16 |
+ |
|
17 |
+\item{deduplUse}{With the exception of the special method \code{custom}, all |
|
18 |
+handling of duplicated gene set names is delegated to function |
|
19 |
+\code{\link{deduplicateGeneSets}} and this argument is directly passed on. |
|
20 |
+Please see \code{?deduplicatedGeneSets}. |
|
21 |
+Using \code{deduplUse=custom} allows import of the GMT file for manual inspection |
|
22 |
+and its content and remedy is the user's responsibility. However, \code{gsva()} |
|
23 |
+will \emph{not} accept the result for further use unless it is modified to have |
|
24 |
+duplicated gene set names removed.} |
|
25 |
+} |
|
26 |
+\value{ |
|
27 |
+A named list of gene sets that represented as character vectors of |
|
28 |
+gene IDs. |
|
29 |
+} |
|
30 |
+\description{ |
|
31 |
+Imports a list of gene sets from a GMT (Gene Matrix Transposed) |
|
32 |
+format file, offering a choice of ways to handle duplicated gene set names. |
|
33 |
+} |
|
34 |
+\seealso{ |
|
35 |
+\code{\link{readLines}}, \code{\link{deduplicateGeneSets}} |
|
36 |
+} |