#########################################################################/** # @RdocFunction readCelUnits # # @title "Reads probe-level data ordered as units (probesets) from one or several Affymetrix CEL files" # # @synopsis # # \description{ # @get "title" by using the unit and group definitions in the # corresponding Affymetrix CDF file. # } # # \arguments{ # \item{filenames}{The filenames of the CEL files.} # \item{units}{An @integer @vector of unit indices specifying which # units to be read. If @NULL, all units are read.} # \item{stratifyBy}{Argument passed to low-level method # @see "affxparser::readCdfCellIndices".} # \item{cdf}{A @character filename of a CDF file, or a CDF @list # structure. If @NULL, the CDF file is searched for by # @see "findCdf" first starting from the current directory and # then from the directory where the first CEL file is.} # \item{...}{Arguments passed to low-level method # @see "affxparser::readCel", e.g. \code{readXY} and \code{readStdvs}.} # \item{addDimnames}{If @TRUE, dimension names are added to arrays, # otherwise not. The size of the returned CEL structure in bytes # increases by 30-40\% with dimension names.} # \item{dropArrayDim}{If @TRUE and only one array is read, the elements of # the group field do \emph{not} have an array dimension.} # \item{transforms}{A @list of exactly \code{length(filenames)} # @functions. If @NULL, no transformation is performed. # Intensities read are passed through the corresponding transform # function before being returned.} # \item{readMap}{A @vector remapping cell indices to file indices. # If @NULL, no mapping is used.} # \item{verbose}{Either a @logical, a @numeric, or a @see "R.utils::Verbose" # object specifying how much verbose/debug information is written to # standard output. If a Verbose object, how detailed the information is # is specified by the threshold level of the object. If a numeric, the # value is used to set the threshold of a new Verbose object. If @TRUE, # the threshold is set to -1 (minimal). If @FALSE, no output is written # (and neither is the \pkg{R.utils} package required). # } # } # # \value{ # A named @list with one element for each unit read. The names # corresponds to the names of the units read. # Each unit element is in # turn a @list structure with groups (aka blocks). # Each group contains requested fields, e.g. \code{intensities}, # \code{stdvs}, and \code{pixels}. # If more than one CEL file is read, an extra dimension is added # to each of the fields corresponding, which can be used to subset # by CEL file. # # Note that neither CEL headers nor information about outliers and # masked cells are returned. To access these, use @see "readCelHeader" # and @see "readCel". # } # # @author "HB" # # @examples "../incl/readCelUnits.Rex" # # \seealso{ # Internally, @see "readCelHeader", @see "readCdfUnits" and # @see "readCel" are used. # } # # \references{ # [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats, # June 14, 2005. # \url{http://www.affymetrix.com/support/developer/} # } # # @keyword "file" # @keyword "IO" #*/######################################################################### readCelUnits <- function(filenames, units=NULL, stratifyBy=c("nothing", "pmmm", "pm", "mm"), cdf=NULL, ..., addDimnames=FALSE, dropArrayDim=TRUE, transforms=NULL, readMap=NULL, verbose=FALSE) { # To please R CMD check Arguments <- enter <- exit <- NULL; rm(list=c("Arguments", "enter", "exit")); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Local functions # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - qsort <- function(x) { ## o0 <- .Internal(qsort(x, TRUE)); ## o <- sort.int(x, index.return=TRUE, method="quick"); ## stopifnot(identical(o, o0)); sort.int(x, index.return=TRUE, method="quick"); } # qsort() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Validate arguments # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Argument 'filenames': filenames <- file.path(dirname(filenames), basename(filenames)); missing <- filenames[!file.exists(filenames)]; if (length(missing)) { stop("File(s) not found: ", paste(missing, collapse=", ")); } # Argument 'units' and 'cdf': if (is.list(cdf) && !is.null(units)) { stop("Arguments 'units' must not be specified if argument 'cdf' is a CDF list structure."); } # Argument 'units': if (is.null(units)) { } else if (is.numeric(units)) { units <- as.integer(units); # Unit indices are one-based in R if (any(units < 1L)) stop("Argument 'units' contains non-positive indices."); } else { stop("Argument 'units' must be numeric or NULL: ", class(units)[1]); } # Argument 'cdf': searchForCdf <- FALSE; if (is.null(cdf)) { searchForCdf <- TRUE; } else if (is.character(cdf)) { cdfFile <- file.path(dirname(cdf), basename(cdf)); if (!file.exists(cdfFile)) stop("File not found: ", cdfFile); cdf <- NULL; } else if (is.list(cdf)) { aUnit <- cdf[[1L]]; if (!is.list(aUnit)) stop("Argument 'cdf' is of unknown format: First unit is not a list."); groups <- aUnit$groups; if (!is.list(groups)) stop("Argument 'cdf' is of unknown format: Units Does not contain the list 'groups'."); extractGroups <- (length(names(aUnit)) > 1L); # Check for group fields 'indices' or 'x' & 'y' in one of the groups. aGroup <- groups[[1]]; extractFields <- TRUE; fields <- names(aGroup); if ("indices" %in% fields) { cdfType <- "indices"; extractFields <- (length(fields) > 1L); } else if (all(c("x", "y") %in% fields)) { # The CDF is needed in order to know the (x,y) dimensions of the # chip so that one can calculate (x,y) -> cell index. cdfType <- "x"; searchForCdf <- TRUE; } else { stop("Argument 'cdf' is of unknown format: The groups contains neither the fields 'indices' nor ('x' and 'y')."); } aUnit <- groups <- aGroup <- NULL; # Not needed anymore } else { stop("Argument 'cdf' must be a filename, a CDF list structure or NULL: ", mode(cdf)); } # Argument 'readMap': if (!is.null(readMap)) { # Cannot check map indices without knowing the array. Is it worth # reading such details already here? } # Argument 'dropArrayDim': dropArrayDim <- as.logical(dropArrayDim); # Argument 'addDimnames': addDimnames <- as.logical(addDimnames); nbrOfArrays <- length(filenames); # Argument 'transforms': if (is.null(transforms)) { hasTransforms <- FALSE; } else if (is.list(transforms)) { if (length(transforms) != nbrOfArrays) { stop("Length of argument 'transforms' does not match the number of arrays: ", length(transforms), " != ", nbrOfArrays); } for (transform in transforms) { if (!is.function(transform)) stop("Argument 'transforms' must be a list of functions."); } hasTransforms <- TRUE; } else { stop("Argument 'transforms' must be a list of functions or NULL."); } # Argument 'stratifyBy': stratifyBy <- match.arg(stratifyBy); # Argument 'verbose': (Utilized the Verbose class in R.utils if available) if (!identical(verbose, FALSE)) { requireNamespace("R.utils") || stop("Package not loaded: R.utils"); Arguments <- R.utils::Arguments enter <- R.utils::enter exit <- R.utils::exit verbose <- Arguments$getVerbose(verbose); } cVerbose <- -(as.numeric(verbose) + 2); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 0. Search for CDF file? # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if (searchForCdf) { verbose && enter(verbose, "Searching for CDF file"); verbose && enter(verbose, "Reading chip type from first CEL file"); celHeader <- readCelHeader(filenames[1L]); chipType <- celHeader$chiptype; verbose && exit(verbose); verbose && enter(verbose, "Searching for chip type '", chipType, "'"); cdfFile <- findCdf(chipType=chipType); if (length(cdfFile) == 0L) { # If not found, try also where the first CEL file is opwd <- getwd(); on.exit(setwd(opwd)); setwd(dirname(filenames[1L])); cdfFile <- findCdf(chipType=chipType); setwd(opwd); } verbose && exit(verbose); if (length(cdfFile) == 0L) stop("No CDF file for chip type found: ", chipType); verbose && exit(verbose); } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 1. Read cell indices for units of interest from the CDF file? # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if (is.null(cdf)) { # verbose && enter(verbose, "Reading cell indices from CDF file"); cdf <- readCdfCellIndices(cdfFile, units=units, stratifyBy=stratifyBy, verbose=FALSE); # verbose && exit(verbose); # Assume 'cdf' contains only "indices" fields. indices <- unlist(cdf, use.names=FALSE); } else { if (cdfType == "indices") { # Clean up CDF list structure from other elements than groups? if (extractGroups) { # verbose && enter(verbose, "Extracting groups..."); cdf <- lapply(cdf, FUN=function(unit) list(groups=unit$groups)); # verbose && exit(verbose); } # Clean up CDF list structure from other group fields than "indices"? if (extractFields) { # verbose && enter(verbose, "Extracting fields..."); cdf <- applyCdfGroups(cdf, cdfGetFields, fields="indices"); # verbose && exit(verbose); } indices <- unlist(cdf, use.names=FALSE); } else { verbose && enter(verbose, "Calculating cell indices from (x,y) positions"); verbose && enter(verbose, "Reading chip layout from CDF file"); cdfHeader <- readCdfHeader(cdfFile); ncol <- cdfHeader$cols; verbose && exit(verbose); # Clean up CDF list structure from other elements than groups cdf <- lapply(cdf, FUN=function(unit) list(groups=unit$groups)); x <- unlist(applyCdfGroups(cdf, cdfGetFields, "x"), use.names=FALSE); y <- unlist(applyCdfGroups(cdf, cdfGetFields, "y"), use.names=FALSE); # Cell indices are one-based in R indices <- as.integer(y * ncol + x + 1); x <- y <- NULL; # Not needed anymore verbose && exit(verbose); } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2. Remapping cell indices? # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if (!is.null(readMap)) { verbose && enter(verbose, "Remapping cell indices"); indices <- readMap[indices]; verbose && exit(verbose); } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2b. Order cell indices for optimal speed when reading, i.e. minimal # jumping around in the file. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - reorder <- TRUE; # Hardwired from now on. if (reorder) { verbose && enter(verbose, "Reordering cell indices to optimize speed"); # qsort() is about 10-15 times faster than using order()! # WAS: o <- .Internal(qsort(indices, TRUE)); # From base::sort.int() o <- qsort(indices); indices <- o$x; # WAS: o <- .Internal(qsort(o$ix, TRUE))$ix; # From base::sort.int() o <- qsort(o$ix)$ix; verbose && exit(verbose); } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 3. Read signals of the cells of interest from the CEL file(s) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Comment: We assign elements of CEL list structure to local environment, # because calling cel[[field]][idxs,] multiple (=nbrOfUnits) times is very # slow whereas get(field) is much faster (about 4-6 times actually!) # /HB 2006-03-24 nbrOfCells <- length(indices); nbrOfUnits <- length(cdf); # Because integer 'nbrOfCells*nbrOfArrays' may overflow to NA, we corce to double # here. See aroma.affymetrix thread 'Speeding up RmaBackgroundCorrection' on # 2014-02-27 for background/details. # FIXME: Ideally, this function should be rewritten to read signals and group them # into CEL units in chunks. /HB 2014-02-27 nbrOfEntries <- as.double(nbrOfCells) * as.double(nbrOfArrays); stopifnot(is.finite(nbrOfEntries)); verbose && enter(verbose, "Reading ", nbrOfUnits, "*", nbrOfCells/nbrOfUnits, "=", nbrOfCells, " cells from ", nbrOfArrays, " CEL files"); # Cell-value elements cellValueFields <- c("x", "y", "intensities", "stdvs", "pixels"); integerFields <- "pixels"; doubleFields <- setdiff(cellValueFields, integerFields); # Local environment where to store the temporary variables env <- environment(); for (kk in seq_len(nbrOfArrays)) { filename <- filenames[kk]; verbose && enter(verbose, "Reading CEL data for array #", kk); celTmp <- readCel(filename, indices=indices, readHeader=FALSE, readOutliers=FALSE, readMasked=FALSE, ..., readMap=NULL, verbose=cVerbose, .checkArgs=FALSE); verbose && exit(verbose); if (kk == 1L) { verbose && enter(verbose, "Allocating return structure"); # Allocate the return list structure # celTmp$header <- NULL; celFields <- names(celTmp); # Update list of special fields cellValueFields <- intersect(celFields, cellValueFields); doubleFields <- intersect(cellValueFields, doubleFields); integerFields <- intersect(cellValueFields, integerFields); # Allocate all field variables dim <- c(nbrOfCells, nbrOfArrays); value <- vector("double", length=nbrOfEntries); dim(value) <- dim; for (name in doubleFields) assign(name, value, envir=env, inherits=FALSE); value <- NULL; # Not needed anymore value <- vector("integer", length=nbrOfEntries); dim(value) <- dim; for (name in integerFields) assign(name, value, envir=env, inherits=FALSE); value <- NULL; # Not needed anymore verbose && exit(verbose); } for (name in cellValueFields) { # Extract field values and re-order them again value <- celTmp[[name]]; if (is.null(value)) next; # "Re-reorder" cells read if (reorder) value <- value[o]; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Transform signals? # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if (hasTransforms && name == "intensities") { verbose && enter(verbose, "Transform signals for array #", kk); value <- transforms[[kk]](value); verbose && exit(verbose); } eval(substitute(name[,kk] <- value, list(name=as.name(name)))); value <- NULL; # Not needed anymore } # for (name ...) celTmp <- NULL; # Not needed anymore } verbose && exit(verbose); indices <- NULL; # Not needed anymore # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 3. Structure CEL data in units and groups according to the CDF file # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - verbose && enter(verbose, "Structuring data by units and groups"); fields <- vector("list", length=length(cellValueFields)); names(fields) <- cellValueFields; # Keep a copy for groups with empty fields emptyFields <- fields; # Add a dimension for the arrays, unless only one array is read # and the array dimension is not wanted. addArrayDim <- (nbrOfArrays >= 2L || !dropArrayDim); seqOfArrays <- list(seq_len(nbrOfArrays)); offset <- 0L; res <- lapply(cdf, FUN=function(u) { lapply(.subset2(u, "groups"), FUN=function(g) { # Same dimensions of all fields field <- .subset2(g, 1L); # Faster than g[[1L]] ncells <- length(field); # Empty unit group? if (ncells == 0L) return(emptyFields); idxs <- offset + 1:ncells; offset <<- offset + ncells; # Get the target dimension dim <- dim(field); if (is.null(dim)) dim <- ncells; if (addDimnames) { dimnames <- dimnames(field); if (is.null(dimnames)) dimnames <- list(seq_len(dim)); # Add an extra dimension for arrays? if (addArrayDim) { dim <- c(dim, nbrOfArrays); dimnames <- c(dimnames, seqOfArrays); } # Update all fields with dimensions setDim <- (length(dim) > 1L); for (name in cellValueFields) { # Faster to drop dimensions. values <- get(name, envir=env, inherits=FALSE)[idxs,,drop=TRUE]; if (setDim) { dim(values) <- dim; dimnames(values) <- dimnames; } else { names(values) <- dimnames; } fields[[name]] <- values; values <- NULL; # Not needed anymore } } else { # Add an extra dimension for arrays? if (addArrayDim) dim <- c(dim, nbrOfArrays); # Update all fields with dimensions setDim <- (length(dim) > 1L); for (name in cellValueFields) { # Faster to drop dimensions. values <- get(name, envir=env, inherits=FALSE)[idxs,,drop=TRUE]; if (setDim) dim(values) <- dim; fields[[name]] <- values; values <- NULL; # Not needed anymore } } # if (addDimnames) fields; }) # lapply(.subset2(u, "groups"), ...); }) # lapply(cdf, ...) verbose && exit(verbose); res; } ############################################################################ # HISTORY: # 2014-02-27 [HB] # o ROBUSTNESS: Using integer constants (e.g. 1L) where applicable. # o ROBUSTNESS: Using explicitly named arguments in more places. # 2012-05-22 [HB] # o CRAN POLICY: readCel() and readCelUnits() are no longer calling # .Internal(qsort(...)). # 2007-12-01 [HB] # o Removed argument 'reorder' from readCelUnits(). Reordering is now always # done. # 2007-06-28 [HB] # o Removed the name of the second argument in a substitute() call. # 2007-02-01 [KS] # o Now readCelUnits() can handle unit groups for which there are no probes, # e.g. when stratifying on PM in a unit containing only MMs. # 2006-10-04 [HB] # o Made readCelUnits() a bit more clever if a 'cdf' structure with only # cell indices is passed. Then all fields are just indices and one can # call unlist immediately. This speeds things up a bit. # 2006-05-12 [HB] # o Rearranged order of arguments such that the most often used/most user- # friendly arguments come first. This was done as a first step after # our developers meeting yesterday. # 2006-04-18 [HB] # o BUG FIX: When argument 'cdf' was a CDF list structure with elements # 'type' or 'direction', readCelUnits() would not read the correct cells # because the values of 'type' and 'direction' would be included in the # extracted list of cell indices. # 2006-04-15 [HB] # o BUG FIX: Passed '...' to both readCdfCellIndices() and readCel(), but # should only be passed to the latter. # 2006-04-01 [HB] # o Using readCdfCellIndices() instead of readCdfUnits(). Faster! # o Added argument 'reorder'. If TRUE, all cells are read in order to # minimize the jumping around in the file. This speeds things up a lot! # I tried this last week, but for some reason I did not see a difference. # 2006-03-29 [HB] # o Renamed argument 'map' to 'readMap'. # 2006-03-28 [HB] # o Unit and cell indices are now one-based. # o Renamed argument 'readCells' to 'readIndices' and same with the name of # the returned group field. # 2006-03-26 [HB] # o Now only "x", "y", "intensities", "pixels", and "stdvs" values are # returned. # 2006-03-24 [HB] # o Made the creation of the final CEL structure according to the CDF much # faster. Now it is about 4-6 times faster utilizing get(field) instead # of cel[[field]]. # o Tried to reorder cell indices in order to minimize jumping around in the # file, but there seems to be no speed up at all doing this. Strange! # 2006-03-14 # o Updated code to make use of package R.utils only if it is available. # 2006-03-08 # o Removed the usage of Arguments of R.utils. This is because we might # move this function to the affxparser package. Still to be removed is # the use of the Verbose class. # 2006-03-04 # o Added argument 'map'. # o Removed all gc(). They slow down quite a bit. # 2006-02-27 [HB] # o BUG FIX: It was only stratifyBy="pmmm" that worked if more than one # array was read. # 2006-02-23 [HB] # o The restructuring code is now more generic, e.g. it does not require # the 'stratifyBy' argument and can append multiple arrays of any # dimensions. # o Now the CDF file is search for where the CEL files lives too. # 2006-02-22 [HB] # o First test where argument 'cdf' is a CDF structure. Fixed some bugs, # but it works now. # o Simple redundancy test: The new code and the updated affxparser package # works with the new aroma.affymetrix/rsp/ GUI. # o Now argument 'cdf' is checked to contain either 'cells' or 'x' & 'y' # group fields. If 'x' and 'y', the cell indices are calculated from # (x,y) and the chip layout obtained from the header of CDF file, which # has been searched for. # 2006-02-21 [HB] # o TO DO: Re implement all of this in a C function to speed things up # further; it is better to put values in the right position from the # beginning. # o Added arguments 'transforms' to be able to transform all probe signals # at once. This improves the speed further. # o Removed annotation of PM/MM dimension when 'stratifyBy="pmmm", because # the resulting object increases ~33% in size. # o Improved speed for restructuring cell data about 20-25%. # o Now it is possible to read multiple CEL files. # o Making use of new 'readCells' in readCdfUnits(), which is much faster. # o Replaced argument 'splitPmMm' with 'stratifyBy'. This speeds up if # reading PM (or MM) only. # o BUG FIX: 'splitPmMm=TRUE' allocated PMs and MMs incorrectly. The reason # was that the indices are already in the correct PM/MM order from the # splitPmMm in readCdfUnits() from which the (x,y) to cell indices are # calculated. # o Updated to make use of the latest affxparser. # 2006-01-24 [HB] # o BUG FIX: Made some modifications a few days ago that introduced missing # variables etc. # 2006-01-21 [HB] # o Added Rdoc comments. # 2006-01-16 [HB] # o Created by HB. ############################################################################