#' readPipelineResults #' #' @param path The path (e.g. folder or prefix) to the results. Either `path` #' or `resfiles` should be given. #' @param resfiles A vector of paths to `*.evaluation.rds` files. Either `path` #' or `resfiles` should be given. #' #' @return A list of results. #' @examples #' # we produce mock pipeline results: #' pip <- mockPipeline() #' datasets <- list( ds1=1:3, ds2=c(5,10,15) ) #' tmpdir1 <- paste0(tempdir(),'/') #' res <- runPipeline(datasets, pipelineDef=pip, output.prefix=tmpdir1, #' alternatives=list() ) #' # we read the evaluation files: #' res <- readPipelineResults(tmpdir1) #' @export readPipelineResults <- function(path = NULL, resfiles = NULL) { if ( (!is.null(path) && !is.null(resfiles)) || (is.null(path) && is.null(resfiles)) ) stop("Exactly one of `path` or `resfiles` should be given.") if (!is.null(path)) { resfiles <- list.files(path, pattern="evaluation\\.rds", full.names=TRUE) if (length(resfiles) == 0) resfiles <- list.files(dirname(path), full.names=TRUE, pattern=paste0(gsub("\\.","\\\\.",basename(path)), ".*\\.evaluation\\.rds$")) if (length(resfiles) == 0) stop("Could not find evaluation files.") } ds <- vapply(strsplit(basename(resfiles), "\\."), FUN=function(x) rev(x)[3], FUN.VALUE=character(1)) if (any(duplicated(ds))) stop("Some datasets appear to occur in more than one file. If these are ", "the result of multiple `runPipeline` calls, please read them ", "separately and use `mergePipelineResults()`.") names(resfiles) <- ds lapply(resfiles, readRDS) } #' aggregatePipelineResults #' #' Aggregates the evaluation and running times of `runPipeline` results. #' Results should be indicated either as a `path`` prefix or as a vector of #' paths to `evaluation\\.rds` files (`resfiles`). #' #' @param res A (named) list of results (per dataset), as produced by #' \code{\link{readPipelineResults}} (or `mergePipelineResults`). #' @param pipDef An optional \code{\link{PipelineDefinition}} containing the #' aggregation methods. If omitted, that from the results will be used. #' #' @return A list with a slot for each step for which there is an aggregation #' method, or (if no aggregation method available) a list of the #' `stepIntermediateReturnObjects` of `runPipeline` #' #' @import S4Vectors #' @export #' @examples #' # we produce mock pipeline results: #' pip <- mockPipeline() #' datasets <- list( ds1=1:3, ds2=c(5,10,15) ) #' tmpdir1 <- paste0(tempdir(),'/') #' res <- runPipeline(datasets, pipelineDef=pip, output.prefix=tmpdir1, #' alternatives=list() ) #' # we read the evaluation files: #' res <- readPipelineResults(tmpdir1) #' # we aggregate the results (equivalent to the output of `runPipeline`): #' res <- aggregatePipelineResults(res) aggregatePipelineResults <- function(res, pipDef = NULL) { if (length(res) == 2 && all(names(res) == c("evaluation", "elapsed"))) stop("`res` appears to be already aggregated, or the results of a single", " dataset") .checkRes(res, requirePDidentity = is.null(pipDef)) if (is.null(pipDef)) pipDef <- metadata(res[[1]])$PipelineDefinition if (is.null(pipDef)) stop("No PipelineDefinition found!") # aggregating elapsed time names(steps) <- steps <- names(res[[1]]$elapsed$stepwise) reso <- SimpleList( evaluation=NULL, elapsed=list(stepwise=lapply(steps, FUN=function(step){ .aggElapsed(lapply(res, FUN = function(x) x$elapsed$stepwise[[step]])) }), total=.aggElapsed(lapply(res, FUN=function(x) x$elapsed$total))) ) metadata(reso)$PipelineDefinition <- pipDef isn <- vapply(stepFn(pipDef, type = "aggregation"), is.null, logical(1)) if (all(isn)) { warning("No aggregation defined in the pipelineDefinition; returning only", " running times.") return(reso) } names(isn) <- isn <- names(isn)[!isn] res <- lapply(res, FUN = function(x) x$evaluation) fullnames <- parsePipNames(names(res[[1]][[length(res[[1]])]])) reso$evaluation <- lapply(isn, FUN = function(x) { message("Aggregating evaluation results for: ", x) stepFn(pipDef,type="aggregation")[[x]](lapply(res, FUN=function(y) y[[x]])) }) reso } .aggElapsed <- function(res) { el.tot <- parsePipNames(names(res[[1]])) el.tot <- el.tot[rep(seq_len(nrow(el.tot)), length(res)), , drop = FALSE] row.names(el.tot) <- NULL el.tot$dataset <- factor(rep(names(res), each = length(res[[1]]))) el.tot$elapsed <- as.numeric(unlist(res)) el.tot } .checkRes <- function(res1, res2 = NULL, requirePDidentity = TRUE) { res1 <- lapply(res1, FUN = function(x) x$evaluation) # check that all datasets have the same steps and number of analyses if (!all(table(unlist(lapply(res1, names))) == length(res1))) stop("The different datasets were not produced with the same pipeline!") for (step in names(res1[[1]])) { tt <- table(unlist(lapply(res1, FUN = function(x) names(x[[step]])))) if (!all(tt == length(res1))) stop("The different datasets do not have the same runs, i.e. they ", "include different sets of alternative parameters.") } if (!is.null(res2)) { .checkRes(res2) res2 <- lapply(res2, FUN = function(x) x$evaluation) if (names(res1)[[1]] != names(res2)[[1]]) stop("The two sets of results were not produced with the same pipeline.") if (is(res1,"SimpleList") && !is.null(metadata(res1)$PipelineDefinition) && is(res2,"SimpleList") && !is.null(metadata(res2)$PipelineDefinition)) { # we require that the evaluation functions be the same pd1 <- metadata(res1)$PipelineDefinition pd2 <- metadata(res2)$PipelineDefinition if (!identical( stepFn(pd1, type = "evaluation"), stepFn(pd2, type = "evaluation") )) { msg <- paste("The evaluation functions of the PipelineDefinitions are", "not identical") if (requirePDidentity) stop(msg) warning(msg) } } } } #' mergePipelineResults #' #' Merges the (non-aggregated) results of any number of runs of `runPipeline` #' using the same \code{\link{PipelineDefinition}} (but on different datasets #' and/or using different parameters). First read the different sets of results #' using \code{\link{readPipelineResults}}, and pass them to this function. #' #' #' @param ... Any number of lists of pipeline results, each as produced by #' \code{\link{readPipelineResults}} #' @param rr Alternatively, the pipeline results can be passed as a list (in #' which case `...` is ignored) #' @param verbose Whether to print processing information #' #' @return A list of merged pipeline results. #' @examples #' # we produce 2 mock pipeline results: #' pip <- mockPipeline() #' datasets <- list( ds1=1:3, ds2=c(5,10,15) ) #' tmpdir1 <- paste0(tempdir(),'/') #' res <- runPipeline(datasets, pipelineDef=pip, output.prefix=tmpdir1, #' alternatives=list() ) #' alternatives <- list(meth1=c('log2','sqrt'), meth2='cumsum') #' tmpdir2 <- paste0(tempdir(),'/') #' res <- runPipeline(datasets, alternatives, pip, output.prefix=tmpdir2) #' # we read the evaluation files: #' res1 <- readPipelineResults(tmpdir1) #' res2 <- readPipelineResults(tmpdir2) #' # we merge them: #' res <- mergePipelineResults(res1,res2) #' # and we aggregate: #' res <- aggregatePipelineResults(res) #' @export mergePipelineResults <- function(..., rr=NULL, verbose=TRUE){ if(is.null(rr)) rr <- list(...) if(any(vapply(rr, FUN.VALUE=logical(1L), FUN=function(x) is.null(x[[1]]$evaluation)))) stop("Some of the objects appear not to be pipeline results.") if(is.null(names(rr))) names(rr) <- paste0("res",1:length(rr)) for(ds in rr) .checkSameSteps(rr) .checkSameSteps(lapply(rr, FUN=function(x) x[[1]])) if(verbose) print(lapply(rr, FUN=function(res){ vapply(res, FUN.VALUE=integer(1L), FUN=function(x) length(rev(x$evaluation)[[1]]))})) # concatenate results names(ds) <- ds <- unique(unlist(lapply(rr, names))) hasdup <- FALSE res <- lapply(ds, FUN=function(dataset){ r2 <- lapply(rr, FUN=function(x) x[[dataset]]) r2 <- r2[!vapply(r2,is.null,logical(1L))] if(length(r2)==1) return(r2[[1]]) if(!hasdup){ rn <- unlist(lapply(r2, FUN=function(x) names(rev(x$evaluation)[[1]]))) if(any(duplicated(rn))) hasdup <- TRUE } res <- r2[[1]] res$errors <- NULL for(i in seq(2,length(r2))){ x <- setdiff(names(r2[[i]]$elapsed$total), names(res$elapsed$total)) res$elapsed$total <- c(res$elapsed$total, r2[[i]]$elapsed$total[x]) for(step in names(res$evaluation)){ x <- setdiff(names(r2[[i]]$evaluation[[step]]), names(res$evaluation[[step]])) res$evaluation[[step]] <- c(res$evaluation[[step]], r2[[i]]$evaluation[[step]][x]) res$elapsed$stepwise[[step]] <- c(res$elapsed$stepwise[[step]], r2[[i]]$elapsed$stepwise[[step]][x]) } } res }) rm(rr) if(hasdup && verbose) message("Some analyses were redundant across results sets, ", "and only the first given was kept.") # retain only complete parameter sets hasmissing <- FALSE for(step in names(res[[1]]$evaluation)){ a <- table(unlist(lapply(res,FUN=function(x) names(x$evaluation[[step]])))) if(length(a)>0){ if(!all(w <- a==length(res))) hasmissing <- TRUE a <- sort(names(a)[w]) if(length(a)==0) stop("No complete parameter set!") res <- lapply(res, FUN=function(x){ x$evaluation[[step]] <- x$evaluation[[step]][a] x$elapsed$stepwise[[step]] <- x$elapsed$stepwise[[step]][a] if(step==rev(names(x$evaluation))[1]) x$elapsed$total <- x$elapsed$total[a] x }) } } if(hasmissing && verbose) warning("Some parameter combinations were not available for all datasets ", "and were dropped.") if(verbose) print(list(merged=vapply(res, FUN.VALUE=integer(1L), FUN=function(x) length(rev(x$evaluation)[[1]])))) res } .checkSameSteps <- function(x, stopifnot=TRUE){ steps <- table(unlist(lapply(x, FUN=function(x) names(x$evaluation)))) if(all(steps==length(x))) return(TRUE) if(stopifnot) stop("Some datasets have different steps - were then run using", " the same PipelineDefinition?") FALSE } .dsMergeResults <- function(res1, res2) { names(steps) <- steps <- names(res1$evaluation) esteps <- vapply(res1$evaluation, FUN = function(x) { !is.null(x) && length(x) > 0 }, logical(1)) edif <- setdiff(names(res2$elapsed$total), names(res1$elapsed$total)) res <- SimpleList(evaluation = lapply(steps[esteps], FUN = function(step) { r1 <- res1$evaluation[[step]] r2 <- res2$evaluation[[step]] dif <- setdiff(names(r2), names(r1)) c(r1, r2[dif]) }), elapsed = list(stepwise = lapply(steps, FUN = function(step) { r1 <- res1$elapsed$stepwise[[step]] r2 <- res2$elapsed$stepwise[[step]] dif <- setdiff(names(r2), names(r1)) c(r1, r2[dif]) }), total = c(res1$elapsed$total, res2$elapsed$total[edif]))) if (!is.null(metadata(res1)$PipelineDefinition)) { metadata(res)$PipelineDefinition <- metadata(res1)$PipelineDefinition } else if (!is.null(metadata(res2)$PipelineDefinition)) { metadata(res)$PipelineDefinition < metadata(res2)$PipelineDefinition } res } #' defaultStepAggregation #' #' @param x A list of results per dataset, each containing a list (1 element #' per combination of parameters) of evaluation metrics (coercible to vectors #' or matrix). #' #' @return A data.frame. #' @export #' @importFrom dplyr bind_rows defaultStepAggregation <- function(x) { y <- x[[1]][[1]] if (is(y, "list") || is(y, "SimpleList")) { res <- lapply(seq_along(y), FUN = function(i) { defaultStepAggregation(lapply(x, FUN = function(x) { lapply(x, FUN = function(x) x[[i]]) })) }) names(res) <- names(y) return(res) } dplyr::bind_rows(lapply(x, FUN = function(x) { x <- cbind(parsePipNames(names(x)), do.call(rbind, x)) row.names(x) <- NULL x }), .id = "dataset") }