Bioconductor Code: GSVA

Browse code

add functions readGMT() and deduplicateGeneSets()

Axel Klenk authored on 25/04/2024 17:15:53
Showing 6 changed files

DESCRIPTION index f65e26557..c4c0ca158 100644
NAMESPACE index 3e3579cd5..573a16a54 100644
R/GSVA-package.R index ba1cf2dee..ea20e3b5c 100644
R/gsvaNewAPI.R index b11864d5d..5bf36f3cd 100644
man/deduplicateGeneSets.Rd index 000000000..59ecf4223
man/readGMT.Rd index 000000000..0b4c5d2b0

History View file @ 2164828bc

@@ -1,5 +1,5 @@
                      Package: GSVA
                     -Version: 1.51.14
                     +Version: 1.51.15
                      Title: Gene Set Variation Analysis for Microarray and RNA-Seq Data
                      Authors@R: c(person("Robert", "Castelo", role=c("aut", "cre"), email="[email protected]"),
                                   person("Justin", "Guinney", role="aut", email="[email protected]"),

NAMESPACE

History View file @ 2164828bc

@@ -69,9 +69,11 @@ importFrom(stats,rnorm)
                      importFrom(stats,rpois)
                      importFrom(stats,sd)
                      importFrom(utils,capture.output)
                     +importFrom(utils,head)
                      importFrom(utils,installed.packages)
                      importFrom(utils,read.csv)
                      importFrom(utils,setTxtProgressBar)
                     +importFrom(utils,tail)
                      importFrom(utils,txtProgressBar)
                      importFrom(utils,write.csv)
                      importMethodsFrom(Biobase,annotation)

R/GSVA-package.R

History View file @ 2164828bc

@@ -14,7 +14,7 @@
                      #'
                      #' @importFrom graphics plot
                      #' @importFrom stats ecdf na.omit rnorm rpois sd
                     -#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar
                     +#' @importFrom utils installed.packages setTxtProgressBar txtProgressBar head tail
                      #' read.csv write.csv capture.output
                      #' @importFrom Matrix nnzero
                      #' @importFrom S4Vectors SimpleList DataFrame

R/gsvaNewAPI.R

History View file @ 2164828bc

@@ -398,6 +398,131 @@ setMethod("geneSetSizes", signature("GsvaExprData"),
                                })
                     +### ----- helper functions for gene set I/O and preprocessing -----
+                    +
                     +#' @title Handling of Duplicated Gene Set Names
                     +#'
                     +#' @description Offers a choice of ways for handling duplicated gene set names
                     +#' that may not be suitable as input to other gene set analysis functions.
                     +#'
                     +#' @param geneSets A named list of gene sets represented as character vectors
                     +#' of gene IDs as e.g. returned by [`readGMT`].
                     +#'
                     +#' @param deduplUse A character vector of length 1 specifying one of several
                     +#' methods to handle duplicated gene set names.
                     +#' Duplicated gene set names are explicitly forbidden by the
                     +#' [GMT file format specification](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats)
                     +#' but can nevertheless be encountered in the wild.
                     +#' The available choices are:
                     +#' * `first` (the default): drops all gene sets whose names are [`duplicated`]
                     +#' according to the base R function and retains only the first occurrence of a
                     +#' gene set name.
                     +#' * `drop`:  removes *all* gene sets that have a duplicated name, including its
                     +#' first occurrence.
                     +#' * `union`: replaces gene sets with duplicated names by a single gene set
                     +#' containing the union of all their gene IDs.
                     +#' * `smallest`: drops gene sets with duplicated names and retains only the
                     +#' smallest of them, i.e. the one with the fewest gene IDs.  If there are
                     +#' several smallest gene sets, the first will be selected.
                     +#' * `largest`: drops gene sets with duplicated names and retains only the
                     +#' largest of them, i.e. the one with the most gene IDs.  If there are
                     +#' several largest gene sets, the first will be selected.
                     +#'
                     +#' @return A named list of gene sets that are represented as character vectors of
                     +#' gene IDs.
                     +#'
                     +#' @aliases deduplicateGeneSets
                     +#' @name deduplicateGeneSets
                     +#' @rdname deduplicateGeneSets
                     +#'
                     deduplicateGeneSets <- function(geneSets,
                                                     deduplUse = c("first", "drop", "union",
                                                                   "smallest", "largest")) {
                         ## Resolve duplicated gene set names in a named list of gene sets.
                         ##   geneSets:  named list of character vectors (gene IDs)
                         ##   deduplUse: one of "first", "drop", "union", "smallest", "largest"
                         ## Returns a named list in which every remaining gene set is a character
                         ## vector; the position of the first occurrence of a name is preserved.
                         ddUse <- match.arg(deduplUse)
                         gsNames <- names(geneSets)
                         isNameDuplicated <- duplicated(gsNames)
                         duplicatedNames <- unique(gsNames[isNameDuplicated])

                         ## no duplicated names (or no names at all): return the input unchanged
                         if(length(duplicatedNames) == 0L)
                             return(geneSets)

                         ## transformation function operating on the unnamed sublist of all gene
                         ## sets sharing one name, returning a single character vector.
                         ## `[[` (not `[`) is required for "smallest"/"largest": `[` would return
                         ## a list of length 1 and the result would no longer be a character
                         ## vector.  which.min()/which.max() return the first of several ties.
                         ## switch() yields NULL for "first" and "drop", which need no function.
                         ddFunc <- switch(ddUse,
                                          union=function(dgs) Reduce(union, dgs),
                                          smallest=function(dgs) dgs[[which.min(lengths(dgs))]],
                                          largest=function(dgs) dgs[[which.max(lengths(dgs))]])

                         ## compute the replacements from the complete input, i.e. before any
                         ## duplicates are removed below
                         dedupl <- NULL
                         if(!is.null(ddFunc)) {
                             dedupl <- lapply(duplicatedNames,
                                              function(dn)
                                                  ddFunc(unname(geneSets[gsNames %in% dn])))
                         }

                         ## drop all duplicate gene sets (sufficient for default of "first")
                         geneSets[isNameDuplicated] <- NULL

                         ## each duplicated name now occurs exactly once (its first occurrence):
                         ## remove it ("drop") or replace it by the deduplicated gene set
                         if(ddUse == "drop") {
                             geneSets[duplicatedNames] <- NULL
                         } else if(!is.null(dedupl)) {
                             geneSets[duplicatedNames] <- dedupl
                         }

                         return(geneSets)
                     }
+                    +
+                    +
                     +#' @title Import Gene Sets from a GMT File
                     +#'
                     +#' @description Imports a list of gene sets from a GMT (Gene Matrix Transposed)
                     +#' format file, offering a choice of ways to handle duplicated gene set names.
                     +#'
                     +#' @param con A connection object or character string containing e.g.
                     +#' a file name or URL.  This is directly passed to [`readLines`] and hence may
                     +#' contain anything that `readLines()` can handle.
                     +#'
                     +#' @param deduplUse With the exception of the special method `custom`, all
                     +#' handling of duplicated gene set names is delegated to function
                     +#' [`deduplicateGeneSets`] and this argument is directly passed on.
                     +#' Please see `?deduplicateGeneSets`.
                     +#' Using `deduplUse=custom` imports the GMT file as is, for manual inspection;
                     +#' remedying its content is then the user's responsibility.  However, `gsva()`
                     +#' will *not* accept the result for further use unless it is modified to have
                     +#' duplicated gene set names removed.
                     +#'
                     +#' @return A named list of gene sets that are represented as character vectors of
                     +#' gene IDs.
                     +#'
                     +#' @seealso [`readLines`], [`deduplicateGeneSets`]
                     +#'
                     +#' @aliases readGMT
                     +#' @name readGMT
                     +#' @rdname readGMT
                     +#'
                     readGMT <- function(con,
                                         deduplUse = c("first", "drop", "union",
                                                       "smallest", "largest", "custom")) {
                         ## Import gene sets from GMT-formatted text.
                         ##   con:       passed unchanged to readLines()
                         ##   deduplUse: how to handle duplicated gene set names; every method
                         ##              except "custom" is delegated to deduplicateGeneSets()
                         ## Returns a named list of character vectors of gene IDs.
                         ddUse <- match.arg(deduplUse)
                         gmtText <- readLines(con=con)

                         ## skip blank lines (e.g. a trailing empty line): they would split into
                         ## zero fields and yield a bogus gene set without a proper name
                         gmtText <- gmtText[nzchar(trimws(gmtText))]
                         gmtLines <- strsplit(gmtText, split="\t", fixed=TRUE)

                         ## field 1 is the gene set name, field 2 a description that is dropped,
                         ## all remaining fields are gene IDs
                         gmt <- lapply(gmtLines, tail, -2)
                         names(gmt) <- vapply(gmtLines, head, character(1), n=1L)

                         if(anyDuplicated(names(gmt)) > 0) {
                             if(ddUse != "custom") {
                                 warning("GMT contains duplicated gene set names; deduplicated",
                                         " using method: ", ddUse)
                                 gmt <- deduplicateGeneSets(geneSets=gmt, deduplUse=ddUse)
                             } else {
                                 ## nothing is deduplicated here, so do not claim it was
                                 warning("GMT contains duplicated gene set names. ",
                                         "Method 'custom' requires YOU to remedy duplicate ",
                                         "gene set names as gsva() will not accept them")
                             }
                         }

                         return(gmt)
                     }
+                    +
+                    +
                      ### ----- methods for data pre-/post-processing -----
                      ## unwrapData: extract a data matrix from a container object

man/deduplicateGeneSets.Rd

History View file @ 2164828bc

                     new file mode 100644
@@ -0,0 +1,45 @@
                     +% Generated by roxygen2: do not edit by hand
                     +% Please edit documentation in R/gsvaNewAPI.R
                     +\name{deduplicateGeneSets}
                     +\alias{deduplicateGeneSets}
                     +\title{Handling of Duplicated Gene Set Names}
                     +\usage{
                     +deduplicateGeneSets(
                     +  geneSets,
                     +  deduplUse = c("first", "drop", "union", "smallest", "largest")
                     +)
                     +}
                     +\arguments{
                     +\item{geneSets}{A named list of gene sets represented as character vectors
                     +of gene IDs as e.g. returned by \code{\link{readGMT}}.}
+                    +
                     +\item{deduplUse}{A character vector of length 1 specifying one of several
                     +methods to handle duplicated gene set names.
                     +Duplicated gene set names are explicitly forbidden by the
                     +\href{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats}{GMT file format specification}
                     +but can nevertheless be encountered in the wild.
                     +The available choices are:
                     +\itemize{
                     +\item \code{first} (the default): drops all gene sets whose names are \code{\link{duplicated}}
                     +according to the base R function and retains only the first occurrence of a
                     +gene set name.
                     +\item \code{drop}:  removes \emph{all} gene sets that have a duplicated name, including its
                     +first occurrence.
                     +\item \code{union}: replaces gene sets with duplicated names by a single gene set
                     +containing the union of all their gene IDs.
                     +\item \code{smallest}: drops gene sets with duplicated names and retains only the
                     +smallest of them, i.e. the one with the fewest gene IDs.  If there are
                     +several smallest gene sets, the first will be selected.
                     +\item \code{largest}: drops gene sets with duplicated names and retains only the
                     +largest of them, i.e. the one with the most gene IDs.  If there are
                     +several largest gene sets, the first will be selected.
                     +}}
                     +}
                     +\value{
                     +A named list of gene sets that are represented as character vectors of
                     +gene IDs.
                     +}
                     +\description{
                     +Offers a choice of ways for handling duplicated gene set names
                     +that may not be suitable as input to other gene set analysis functions.
                     +}

man/readGMT.Rd

History View file @ 2164828bc

                     new file mode 100644
@@ -0,0 +1,36 @@
                     +% Generated by roxygen2: do not edit by hand
                     +% Please edit documentation in R/gsvaNewAPI.R
                     +\name{readGMT}
                     +\alias{readGMT}
                     +\title{Import Gene Sets from a GMT File}
                     +\usage{
                     +readGMT(
                     +  con,
                     +  deduplUse = c("first", "drop", "union", "smallest", "largest", "custom")
                     +)
                     +}
                     +\arguments{
                     +\item{con}{A connection object or character string containing e.g.
                     +a file name or URL.  This is directly passed to \code{\link{readLines}} and hence may
                     +contain anything that \code{readLines()} can handle.}
+                    +
                     +\item{deduplUse}{With the exception of the special method \code{custom}, all
                     +handling of duplicated gene set names is delegated to function
                     +\code{\link{deduplicateGeneSets}} and this argument is directly passed on.
                     +Please see \code{?deduplicateGeneSets}.
                     +Using \code{deduplUse=custom} imports the GMT file as is, for manual inspection;
                     +remedying its content is then the user's responsibility.  However, \code{gsva()}
                     +will \emph{not} accept the result for further use unless it is modified to have
                     +duplicated gene set names removed.}
                     +}
                     +\value{
                     +A named list of gene sets that are represented as character vectors of
                     +gene IDs.
                     +}
                     +\description{
                     +Imports a list of gene sets from a GMT (Gene Matrix Transposed)
                     +format file, offering a choice of ways to handle duplicated gene set names.
                     +}
                     +\seealso{
                     +\code{\link{readLines}}, \code{\link{deduplicateGeneSets}}
                     +}