#########################################################################/** # @RdocFunction readCdfCellIndices # # @title "Reads (one-based) cell indices of units (probesets) in an Affymetrix CDF file" # # @synopsis # # \description{ # @get "title". # } # # \arguments{ # \item{filename}{The filename of the CDF file.} # \item{units}{An @integer @vector of unit indices # specifying which units to be read. If @NULL, all units are read.} # \item{stratifyBy}{A \code{\link[base]{character}} string specifying which and how # elements in group fields are returned. # If \code{"nothing"}, elements are returned as is, i.e. as @vectors. # If \code{"pm"}/\code{"mm"}, only elements corresponding to # perfect-match (PM) / mismatch (MM) probes are returned (as @vectors). # If \code{"pmmm"}, elements are returned as a matrix where the # first row holds elements corresponding to PM probes and the second # corresponding to MM probes. Note that in this case, it is assumed # that there are equal number of PMs and MMs; if not, an error is # generated. # Moreover, the PMs and MMs may not even be paired, i.e. there is no # guarantee that the two elements in a column corresponds to a # PM-MM pair.} # \item{verbose}{An @integer specifying the verbose level. If 0, the # file is parsed quietly. The higher numbers, the more details.} # } # # \value{ # A named @list where the names corresponds to the names # of the units read. Each unit element of the list is in turn a # @list structure with one element \code{groups} which in turn # is a @list. Each group element in \code{groups} is a @list # with a single field named \code{indices}. Thus, the structure is # \preformatted{ # cdf # +- unit #1 # | +- "groups" # | +- group #1 # | | +- "indices" # | | group #2 # | | +- "indices" # | . # | +- group #K # | +- "indices" # +- unit #2 # . # +- unit #J # } # # This is structure is compatible with what @see "readCdfUnits" returns. # # Note that these indices are \emph{one-based}. # } # # \section{Cell indices are one-based}{ # Note that in \pkg{affxparser} all \emph{cell indices} are by # convention \emph{one-based}, which is more convenient to work # with in \R. For more details on one-based indices, see # @see "2. Cell coordinates and cell indices". # } # # @author "HB" # # \seealso{ # @see "readCdfUnits". # } # # @keyword "file" # @keyword "IO" #*/######################################################################### readCdfCellIndices <- function(filename, units=NULL, stratifyBy=c("nothing", "pmmm", "pm", "mm"), verbose=0) { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Validate arguments # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Argument 'filename': filename <- file.path(dirname(filename), basename(filename)); if (!file.exists(filename)) stop("File not found: ", filename); # Argument 'units': if (is.null(units)) { } else if (is.numeric(units)) { units <- as.integer(units); if (any(units < 1)) stop("Argument 'units' contains non-positive indices."); } else { stop("Argument 'units' must be numeric or NULL: ", class(units)[1]); } # Argument 'verbose': if (length(verbose) != 1) stop("Argument 'verbose' must be a single integer."); verbose <- as.integer(verbose); if (!is.finite(verbose)) stop("Argument 'verbose' must be an integer: ", verbose); # Argument 'stratifyBy': stratifyBy <- match.arg(stratifyBy); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Read the CDF file # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # UNSUPPORTED CASE? if (!is.null(units) && length(units) == 0L) { stop("readCdfCellIndices(..., units=integer(0)) is not supported.") } cdf <- .Call("R_affx_get_cdf_cell_indices", filename, units, verbose, PACKAGE="affxparser"); # Sanity check if (is.null(cdf)) { stop("Failed to read cell indices from CDF file: ", filename); } if (stratifyBy == "nothing") return(cdf); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Stratify by PM, MM, or PM & MM # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - isPm <- readCdfIsPm(filename, units=units); # Using .subset2() instead of "[["() to avoid dispatching overhead etc. if (stratifyBy == "pmmm") { dimnames <- list(c("pm", "mm"), NULL); for (uu in seq_along(cdf)) { # groups <- cdf[[uu]]$groups; groups <- .subset2(.subset2(cdf, uu), "groups"); ngroups <- length(groups); if (ngroups == 0) next; for (gg in 1:ngroups) { # group <- groups[[gg]]; group <- .subset2(groups, gg); # pm <- isPm[[uu]][[gg]]; pm <- .subset2(.subset2(isPm, uu), gg); idx <- 1:length(pm); mm <- idx[!pm]; # Note: which(!pm) is about 60% slower! /HB pm <- idx[pm]; npm <- length(pm); if (npm != length(mm)) { # This is not expected to happen, but just in case. stop("Number of PM and MM probes differ in probeset #", uu, ": ", length(pm), " != ", length(mm)); } pmmm <- matrix(c(pm, mm), nrow=2L, ncol=npm, byrow=TRUE); # dimnames(pmmm) <- dimnames; # Re-order cell elements according to PM/MM. dim <- c(2, npm); # value <- group[[1]][pmmm]; value <- .subset(.subset2(group, 1), pmmm); dim(value) <- dim; dimnames(value) <- dimnames; group[[1]] <- value; # group[["pmmm"]] <- pmmm; groups[[gg]] <- group; } # for (gg ...) cdf[[uu]]$groups <- groups; } # for (uu ...) } else if (stratifyBy == "pm") { for (uu in seq_along(cdf)) { # groups <- cdf[[uu]]$groups; groups <- .subset2(.subset2(cdf, uu), "groups"); ngroups <- length(groups); if (ngroups == 0) next; for (gg in 1:ngroups) { # group <- groups[[gg]]; group <- .subset2(groups, gg); ngroup <- length(group); if (ngroup == 0) next; pm <- .subset2(.subset2(isPm, uu), gg); pm <- (1:length(pm))[pm]; # Note: which(pm) is about 60% slower! for (kk in 1:ngroup) { group[[kk]] <- .subset(.subset2(group, kk), pm); } groups[[gg]] <- group; } # for (gg ...) cdf[[uu]]$groups <- groups; } # for (uu ...) } else if (stratifyBy == "mm") { for (uu in seq_along(cdf)) { # groups <- cdf[[uu]]$groups; groups <- .subset2(.subset2(cdf, uu), "groups"); ngroups <- length(groups); if (ngroups == 0) next; for (gg in 1:ngroups) { # group <- groups[[gg]]; group <- .subset2(groups, gg); ngroup <- length(group); if (ngroup == 0) next; pm <- .subset2(.subset2(isPm, uu), gg); mm <- (1:length(pm))[!pm]; # Note: which(!pm) is about 60% slower! for (kk in 1:ngroup) { group[[kk]] <- .subset(.subset2(group, kk), mm); } groups[[gg]] <- group; } # for (gg ...) cdf[[uu]]$groups <- groups; } # for (uu ...) } cdf; } # readCdfUnitInidices() ############################################################################ # HISTORY: # 2011-11-18 # o ROBUSTNESS: Added sanity check that the native code did not return NULL. # 2010-12-12 # o ROBUSTNESS: Replaces .Internal(matrix(...)) with matrix(). # In the upcoming R 2.13.0 matrix() has less overhead. # 2006-12-10 # o BUG FIX: Same stratifyBy="mm" bug here as in readCdfUnits(). # 2006-07-22 # o Making more use of .subset2(). # 2006-05-20 # o Rd fix: The \value{} was incorrect. # 2006-05-12 # o Removed argument 'flat'. Will not be used for a while. The intention # was to remove the redundant levels of "groups" and possibly also the # "indices" level. That would most likely speed up things a bit, but it # would also require that readCelUnits() understand this other format too. # 2006-04-01 # o Created, because it is very commonly used and is about 5 times faster # than using readCdfUnits(..., readIndices=TRUE). /HB ############################################################################