goana <- function(de,...) UseMethod("goana") goana.MArrayLM <- function(de, coef = ncol(de), geneid = rownames(de), FDR = 0.05, trend = FALSE, ...) # Gene ontology analysis of DE genes from linear model fit # Gordon Smyth and Yifang Hu # Created 20 June 2014. Last modified 11 Sep 2022. { # Avoid argument collision with default method dots <- names(list(...)) if("universe" %in% dots) stop("goana.MArrayLM defines its own universe",call.=FALSE) if((!is.logical(trend) || trend) && "covariate" %in% dots) stop("goana.MArrayLM defines it own covariate",call.=FALSE) # Check fit if(is.null(de$coefficients)) stop("de does not appear to be a valid MArrayLM fit object.") if(is.null(de$p.value)) stop("p.value not found in fit object, perhaps need to run eBayes first.") if(length(coef) != 1) stop("Only one coef can be specified.") ngenes <- nrow(de) # Check geneid # Can be either a vector of gene IDs or an annotation column name geneid <- as.character(geneid) if(length(geneid) == ngenes) { universe <- geneid } else { if(length(geneid) == 1L) { universe <- de$genes[[geneid]] if(is.null(universe)) stop("Column ",geneid," not found in de$genes") } else stop("geneid of incorrect length") } # Check trend # Can be logical, or a numeric vector of covariate values, or the name of the column containing the covariate values if(is.logical(trend)) { if(trend) { covariate <- de$Amean if(is.null(covariate)) stop("Amean not found in fit") } else { covariate <- NULL } } else { if(is.numeric(trend)) { if(!identical(length(trend),ngenes)) stop("If trend is numeric, then length must equal nrow(de)") covariate <- trend trend <- TRUE } else { if(is.character(trend)) { if(!identical(length(trend),1L)) stop("If trend is character, then length must be 1") covariate <- de$genes[[trend]] if(is.null(covariate)) stop("Column ",trend," not found in de$genes") trend <- TRUE } else stop("trend is neither logical, numeric nor character") } } # Check for NA gene IDs or covariate values if(anyNA(universe) || anyNA(covariate)) { isna <- is.na(universe) if(all(isna)) stop("Gene IDs are all NA") if(trend) { isna[is.na(covariate)] <- TRUE covariate <- covariate[!isna] } universe <- universe[!isna] de <- de[!isna,] } # Check FDR if(!is.numeric(FDR) | length(FDR) != 1) stop("FDR must be numeric and of length 1.") if(FDR < 0 || FDR > 1) stop("FDR should be between 0 and 1.") # Get up and down DE genes fdr.coef <- p.adjust(de$p.value[,coef], method = "BH") EG.DE.UP <- universe[fdr.coef < FDR & de$coefficients[,coef] > 0] EG.DE.DN <- universe[fdr.coef < FDR & de$coefficients[,coef] < 0] DEGenes <- list(Up=EG.DE.UP, Down=EG.DE.DN) # If no DE genes, return data.frame with 0 rows if( identical(length(EG.DE.UP),0L) && identical(length(EG.DE.DN),0L) ) { message("No DE genes") return(data.frame()) } goana(de=DEGenes, universe = universe, covariate=covariate, ...) } goana.default <- function(de, universe = NULL, species = "Hs", null.prob = NULL, covariate=NULL, plot=FALSE, ...) # Gene ontology analysis of DE genes # Gordon Smyth and Yifang Hu # Created 20 June 2014. Last modified 11 Sep 2022. { # Get access to package of GO terms suppressPackageStartupMessages(OK <- requireNamespace("GO.db",quietly=TRUE)) if(!OK) stop("GO.db package required but is not installed (or can't be loaded)") # Get access to required annotation functions suppressPackageStartupMessages(OK <- requireNamespace("AnnotationDbi",quietly=TRUE)) if(!OK) stop("AnnotationDbi package required but is not installed (or can't be loaded)") # Load appropriate organism package orgPkg <- paste0("org.",species,".eg.db") suppressPackageStartupMessages(OK <- requireNamespace(orgPkg,quietly=TRUE)) if(!OK) stop(orgPkg," package required but is not installed (or can't be loaded)") # Get GO to Entrez Gene mappings obj <- paste0("org.",species,".egGO2ALLEGS") egGO2ALLEGS <- tryCatch(getFromNamespace(obj,orgPkg), error=function(e) FALSE) if(is.logical(egGO2ALLEGS)) stop("Can't find gene ontology mappings in package ",orgPkg) # Convert gene-GOterm mappings to data.frame # Remove duplicate entries from both mappings and universe if(is.null(universe)) { GeneID.PathID <- AnnotationDbi::toTable(egGO2ALLEGS)[,c("gene_id","go_id","Ontology")] i <- !duplicated(GeneID.PathID[,c("gene_id", "go_id")]) GeneID.PathID <- GeneID.PathID[i, ] universe <- unique(GeneID.PathID[,1]) if( !(is.null(null.prob) && is.null(covariate)) ) { message("Ignoring covariate and null.prob because universe not set") null.prob <- covariate <- NULL } } else { universe <- as.character(universe) lu <- length(universe) if(!is.null(null.prob) && length(null.prob)!=lu) stop("universe and null.prob must have same length") if(!is.null(covariate) && length(covariate)!=lu) stop("universe and covariate must have same length") if(anyNA(universe) || anyNA(covariate) || anyNA(null.prob)) { isna <- is.na(universe) if(all(isna)) stop("Gene IDs are all NA") if(!is.null(covariate)) { isna[is.na(covariate)] <- TRUE covariate <- covariate[!isna] } if(!is.null(null.prob)) { isna[is.na(null.prob)] <- TRUE null.prob <- null.prob[!isna] } universe <- universe[!isna] } if(anyDuplicated(universe)) { i <- !duplicated(universe) if(!is.null(covariate)) covariate <- covariate[i] if(!is.null(null.prob)) null.prob <- null.prob[i] universe <- universe[i] } # Make universe and set of all annotated genes agree i <- (universe %in% AnnotationDbi::Lkeys(egGO2ALLEGS)) universe <- universe[i] if(!is.null(covariate)) covariate <- covariate[i] if(!is.null(null.prob)) null.prob <- null.prob[i] AnnotationDbi::Lkeys(egGO2ALLEGS) <- universe # Convert GO annotation to data.frame GeneID.PathID <- AnnotationDbi::toTable(egGO2ALLEGS)[,c("gene_id","go_id","Ontology")] d <- duplicated(GeneID.PathID[,c("gene_id", "go_id")]) GeneID.PathID <- GeneID.PathID[!d, ] } # From here, code is mostly the same as kegga.default # Ensure de is a list if(is.list(de)) { if(is.data.frame(de)) stop("de should be a list of character vectors. It should not be a data.frame.") } else { de <- list(DE = de) } nsets <- length(de) # Stop if components of de are not vectors if(!all(vapply(de,is.vector,TRUE))) stop("components of de should be vectors") # Ensure de gene IDs are unique and of character mode for (s in 1:nsets) de[[s]] <- unique(as.character(de[[s]])) # Ensure de components have unique names names(de) <- trimWhiteSpace(names(de)) NAME <- names(de) i <- which(NAME == "" | is.na(NAME)) NAME[i] <- paste0("DE",i) names(de) <- makeUnique(NAME) # Check universe isn't empty NGenes <- length(universe) if(NGenes<1L) stop("No annotated genes found in universe") # Restrict DE genes to universe for (s in 1:nsets) de[[s]] <- de[[s]][de[[s]] %in% universe] # Restrict pathways to universe i <- GeneID.PathID[,1] %in% universe if(sum(i)==0L) stop("Pathways do not overlap with universe") GeneID.PathID <- GeneID.PathID[i,] # Get null.prob trend in DE with respect to the covariate, combining all de lists if(!is.null(covariate)) { if(!is.null(null.prob)) message("null.prob being recomputed from covariate") covariate <- as.numeric(covariate) isDE <- (universe %in% unlist(de)) nDE <- sum(isDE) if(identical(nDE,0L)) { message("No DE genes") null.prob <- covariate <- NULL } else { null.prob <- goanaTrend(isDE,covariate,plot=plot) } } # Overlap pathways with DE genes # Create incidence matrix (X) of gene.pathway by DE sets if(is.null(null.prob)) { X <- matrix(1,nrow(GeneID.PathID),nsets+1) colnames(X) <- c("N",names(de)) } else { names(null.prob) <- universe X <- matrix(1,nrow(GeneID.PathID),nsets+2) X[,nsets+2] <- null.prob[GeneID.PathID[,1]] colnames(X) <- c("N",names(de),"PP") } for (s in 1:nsets) X[,s+1] <- (GeneID.PathID[,1] %in% de[[s]]) # Count #genes and #DE genes and sum null.prob for each pathway S <- rowsum(X, group=GeneID.PathID[,2], reorder=FALSE) # Overlap tests PValue <- matrix(0,nrow=nrow(S),ncol=nsets) colnames(PValue) <- paste("P", names(de), sep=".") nde <- lengths(de, use.names=FALSE) if(!is.null(null.prob)) { # Probability ratio for each pathway vs rest of universe SumPP <- sum(null.prob) M2 <- NGenes-S[,"N"] Odds <- S[,"PP"] / (SumPP-S[,"PP"]) * M2 / S[,"N"] # Wallenius' noncentral hypergeometric test # Note that dWNCHypergeo() is more accurate than pWNCHypergeo(), hence use of 2-terms if(!requireNamespace("BiasedUrn",quietly=TRUE)) stop("BiasedUrn package required but is not installed (or can't be loaded)") for(j in seq_len(nsets)) for(i in seq_len(nrow(S))) PValue[i,j] <- BiasedUrn::pWNCHypergeo(S[i,1L+j], S[i,"N"], M2[i], nde[j], Odds[i], lower.tail=FALSE) + BiasedUrn::dWNCHypergeo(S[i,1L+j], S[i,"N"], M2[i], nde[j], Odds[i]) # Remove sum of prob column, not needed for output S <- S[,-ncol(S),drop=FALSE] } else { # Fisher's exact test for(j in seq_len(nsets)) PValue[,j] <- phyper(S[,1L+j]-0.5, nde[j], NGenes-nde[j], S[,"N"], lower.tail=FALSE) } # Assemble output GOID <- rownames(S) TERM <- suppressMessages(AnnotationDbi::select(GO.db::GO.db,keys=GOID,columns="TERM")) m <- match(GOID,GeneID.PathID[,2]) Ont <- GeneID.PathID[m,3] Results <- data.frame(Term=TERM[,2], Ont=Ont, S, PValue, stringsAsFactors=FALSE) Results } topGO <- function(results, ontology = c("BP", "CC", "MF"), sort = NULL, number = 20L, truncate.term = NULL, p.value = 1) # Extract top GO terms from goana output # Gordon Smyth and Yifang Hu # Created 20 June 2014. Last modified 22 Dec 2022. { # Check results if(!is.data.frame(results)) stop("results should be a data.frame.") # Check ontology ontology <- match.arg(unique(ontology), c("BP", "CC", "MF"), several.ok = TRUE) # Limit results to specified ontologies if(length(ontology) < 3L) { sel <- results$Ont %in% ontology results <- results[sel,] } dimres <- dim(results) # Check number if(!is.numeric(number)) stop("number should be a positive integer") if(number > dimres[1]) number <- dimres[1] if(number < 1L) return(results[integer(0),]) # Number of gene lists for which results are reported # Lists are either called "Up" and "Down" or have user-supplied names nsets <- (dimres[2L]-3L) %/% 2L if(nsets < 1L) stop("results has wrong number of columns") setnames <- colnames(results)[4L:(3L+nsets)] # Check sort. Defaults to all gene lists. if(is.null(sort)) { isort <- 1L:nsets } else { sort <- as.character(sort) isort <- which(tolower(setnames) %in% tolower(sort)) if(!length(isort)) stop("sort column not found in results") } # Filter by minimum p-value for specified gene lists P.col <- 3L+nsets+isort if(length(P.col)==1L) { P <- results[,P.col] } else { P <- do.call("pmin",as.data.frame(results[,P.col,drop=FALSE])) } if(p.value < 1) number <- min(number, sum(P <= p.value)) if(number < 1L) return(results[integer(0),]) # Sort by minimum p-value o <- order(P,results$N,results$Term) tab <- results[o[1L:number],,drop=FALSE] # Truncate Term column for readability if(!is.null(truncate.term)) { truncate.term <- as.integer(truncate.term[1]) truncate.term <- max(truncate.term,5L) truncate.term <- min(truncate.term,1000L) tm2 <- truncate.term-3L i <- (nchar(tab$Term) > tm2) tab$Term[i] <- paste0(substring(tab$Term[i],1L,tm2),"...") } tab }