################################################################################ # UTILITY FUNCTIONS ################################################################################ # These are functions copied over from my repository of utilities used # by this package. They are repeated here simply for portability, so this # package can be deployed on systems without access to my utilities. # Any changes should probably be backported to the primary functions rather # than in these convenience duplications. # # These functions should probably remain interior to the package (not exported) # #' setSharedDataDir #' Sets global variable specifying the default data directory. #' #' @export #' @param sharedDataDir directory where the shared data is stored. #' @return No return value. #' @examples #' setSharedDataDir("project/data") setSharedDataDir = function(sharedDataDir) { options(SHARE.DATA.DIR=sharedDataDir) return() } countOverlapsAny = function(subj, quer, cores=1) { setLapplyAlias(cores) l = unlist(lapplyAlias(subj, function(x) { sum(overlapsAny(x, quer)) } )) return(l) } ################################################################################ # INTERNAL FUNCTIONS - not exported ################################################################################ #' cleanws takes multi-line, code formatted strings and just formats them #' as simple strings #' @param string string to clean #' @return A string with all consecutive whitespace characters, including #' tabs and newlines, merged into a single space. cleanws = function(string) { return(gsub('\\s+'," ", string)) } #' This will change the string in filename to have a new extension #' @param filename string to convert #' @param extension new extension #' @return Filename with original extension deleted, replaced by provided #' extension replaceFileExtension = function(filename, extension) { sub("\\..*$", enforceEdgeCharacter(extension, prependChar="."), paste0(filename, ".")) } #' Just a reverser. Reverses the order of arguments and passes them #' untouched to countOverlapsAny -- so you can use it with lapply. #' #' @param subj Subject #' @param quer Query #' @return Results from countOverlaps countOverlapsAnyRev = function(subj, quer) { countOverlapsAny(quer, subj) } #' converts a list of GRanges into a GRangesList; strips all metadata. #' @param lst a list of GRanges objects #' @return a GRangesList object listToGRangesList = function(lst) { if(!is(lst, "GRangesList")) { if ("list" %in% class(lst)) { #strip elementMetadata lst = lapply(lst, function(x) { values(x) <- NULL; return(x) } ) lst = GRangesList(lst) } else { message(cleanws("Converting GRanges to GRangesList.")) lst = GRangesList(list(lst)) } } return(lst) } #' Wrapper of write.table that provides defaults to write a #' simple .tsv file. Passes additional arguments to write.table #' #' @param ... Additional arguments passed to write.table #' @return No return value write.tsv = function(...) { write.table(..., sep="\t", row.names=FALSE, quote=FALSE) } ###################################################################### #' Imports bed files and creates GRanges objects, using the fread() #' function from data.table. #' #' @param file File name of bed file. #' @return GRanges Object #' @export #' @examples #' a = readBed(system.file("extdata", "examples/combined_regions.bed", #' package="LOLA")) readBed = function(file) { DT = fread(file) # bed specification says: # 1=chr, 2=start, 3=end, 4=name, 5=score (discarded), 6=strand. cn = rep(NA, 6) readCols = colnames(DT) cn[seq_along(readCols)] = readCols # BED files are 0-based, but internal bioc representations use 1-based coords. DT[, (colnames(DT)[2]) := (get(colnames(DT)[2]) + 1)] # Convert any unknown strands to '*' DT[, (cn[3])] tfbsgr = dtToGr(DT, chr=cn[1], start=cn[2], end=cn[3], name=cn[4], strand=cn[6]) return(tfbsgr) } #Two utility functions for converting data.tables into GRanges objects #genes = dtToGR(gModels, "chr", "txStart", "txEnd", "strand", "geneId") dtToGrInternal = function(DT, chr, start, end=NA, strand=NA, name=NA, metaCols=NA) { if (is.na(end)) { end = start } if (is.na(strand)) { gr=GRanges( seqnames=DT[[`chr`]], ranges=IRanges(start=DT[[`start`]], end=DT[[`end`]]), strand="*" ) } else { # GRanges can only handle '*' for no strand, so replace any non-accepted # characters with '*' DT[,strand:=as.character(strand)] DT[strand=="1", strand:="+"] DT[strand=="-1", strand:="-"] DT[[`strand`]] = gsub("[^+-].*", "*", DT[[`strand`]]) gr=GRanges( seqnames=DT[[`chr`]], ranges=IRanges(start=DT[[`start`]], end=DT[[`end`]]), strand=DT[[`strand`]] ) } if (! is.na(name) ) { names(gr) = DT[[`name`]] } else { names(gr) = seq_along(gr) } # This code is not used in LOLA: #if(! is.na(metaCols)) { # for(x in metaCols) { # elementMetadata(gr)[[`x`]]=DT[[`x`]] # } #} gr } dtToGr = function(DT, chr="chr", start="start", end=NA, strand=NA, name=NA, splitFactor=NA, metaCols=NA) { if(is.na(splitFactor)) { return(dtToGrInternal(DT, chr, start, end, strand, name,metaCols)) } if ( length(splitFactor) == 1 ) { if( splitFactor %in% colnames(DT) ) { splitFactor = DT[, get(splitFactor)] } } lapply(split(seq_len(nrow(DT)), splitFactor), function(x) { dtToGrInternal(DT[x,], chr, start, end, strand, name,metaCols) } ) } # If you want to use the GenomicRanges countOverlaps function, but you want to # do it in an lapply, that will work... but you can only do it in one direction. # If you want to lapply on the opposite argument, you can't do it (because # countOverlaps is not symmetric: it depends on which argument comes first). # If you want to do an lapply, but countOverlaps with the query as the second # argument instead of the first, you can use this function to simply reverse # the order of the arguments. # This is used in the enrichment calculations (originally from the EWS # project; 2014, CeMM). countOverlapsRev = function(query, subject, ...) { return(countOverlaps(subject, query, ...)) } # For windows, let's try this. countFileLinesBackup = function(filename) { # From https://stackoverflow.com/a/23456450/946721 f = file(filename, open="rb") nlines <- 0L while (length(chunk <- readBin(f, "raw", 65536)) > 0) { nlines = nlines + sum(chunk == as.raw(10L)) } close(f) return(as.numeric(nlines)) } # Parses result of system "wc" wordcount to return the number of lines # in a file into R. countFileLines = function(filename) { if (!file.exists(filename)) { warning("File does not exist:", filename) return(0) } nlines = tryCatch( { return(as.numeric( strsplit(sub("^\\s+", "", system(paste("wc -l ", filename), intern=TRUE)), " ")[[1]][1] )) }, error= function(err) { return(NA) } ) if (is.na(nlines)) { return(countFileLinesBackup(filename)) } } #' Function to sample regions from a GRangesList object, in specified proportion #' #' @param GRL GRangesList from which to sample #' @param prop vector with same length as GRL, of values between 0-1, #' proportion of the list to select #' #' @return A sampled subset of original GRangesList object. sampleGRL = function(GRL, prop) { sampleGRanges = function(GR, prop) { GR[sample(length(GR), floor(length(GR) * prop))] } mapply(sampleGRanges, GRL, prop) } #' To make parallel processing a possibility but not required, #' I use an lapply alias which can point at either the base lapply #' (for no multicore), or it can point to mclapply, #' and set the options for the number of cores (what mclapply uses). #' With no argument given, returns intead the number of cpus currently selected. #' #' @param cores Number of cpus #' @return None setLapplyAlias = function(cores=0) { if (cores < 1) { return(getOption("mc.cores")) } if(cores > 1) { #use multicore? if (requireNamespace("parallel", quietly = TRUE)) { options(mc.cores=cores) } else { warning("You don't have package parallel installed. Setting cores to 1.") options(mc.cores=1) #reset cores option. } } else { options(mc.cores=1) #reset cores option. } } #' Function to run lapply or mclapply, depending on the option set in #' getOption("mc.cores"), which can be set with setLapplyAlias(). #' #' @param ... Arguments passed lapply() or mclapply() #' @param mc.preschedule Argument passed to mclapply #' @return Result from lapply or parallel::mclapply lapplyAlias = function(..., mc.preschedule=TRUE) { if (is.null(getOption("mc.cores"))) { setLapplyAlias(1) } if(getOption("mc.cores") > 1) { return(parallel::mclapply(..., mc.preschedule=mc.preschedule)) } else { return(lapply(...)) } } #check for, and fix, trailing slash. if necessary enforceTrailingSlash = function(folder) { enforceEdgeCharacter(folder, appendChar="/") } enforceEdgeCharacter = function(string, prependChar="", appendChar="") { if (string=="" | is.null(string)) { return(string) } if(!is.null(appendChar)) { if (substr(string,nchar(string), nchar(string)) != appendChar) { # +1 ? string = paste0(string, appendChar) } } if (!is.null(prependChar)) { if (substr(string,1,1) != prependChar) { # +1 ? string = paste0(prependChar, string) } } return(string) } #' Named list function. #' #' This function is a drop-in replacement for the base list() function, #' which automatically names your list according to the names of the #' variables used to construct it. #' It seemlessly handles lists with some names and others absent, #' not overwriting specified names while naming any unnamed parameters. #' Took me awhile to figure this out. #' #' @param ... arguments passed to list() #' @return A named list object. nlist = function(...) { fcall = match.call(expand.dots=FALSE) l = list(...) if(!is.null(names(list(...)))) { names(l)[names(l) == ""] = fcall[[2]][names(l) == ""] } else { names(l) = fcall[[2]] } return(l) }