R/readCdfUnits.R
f1d6fcf0
 #########################################################################/**
 # @RdocFunction readCdfUnits
 #
 # @title "Reads units (probesets) from an Affymetrix CDF file"
 #
 # @synopsis 
 # 
 # \description{
 #  @get "title". Gets all or a subset of units (probesets).
 # }
 # 
 # \arguments{
 #  \item{filename}{The filename of the CDF file.}
 #  \item{units}{An @integer @vector of unit indices
 #    specifying which units to be read.  If @NULL, all units are read.}
 #  \item{readXY}{If @TRUE, cell row and column (x,y) coordinates are
 #     retrieved, otherwise not.}
 #  \item{readBases}{If @TRUE, cell P and T bases are retrieved, otherwise not.}
 #  \item{readExpos}{If @TRUE, cell "expos" values are retrieved, otherwise not.}
 #  \item{readType}{If @TRUE, unit types are retrieved, otherwise not.}
505779dd
 #  \item{readDirection}{If @TRUE, unit \emph{and} group directions are
 #    retrieved, otherwise not.}
f1d6fcf0
 #  \item{stratifyBy}{A @character string specifying which and how
 #    elements in group fields are returned.
 #    If \code{"nothing"}, elements are returned as is, i.e. as @vectors.
 #    If \code{"pm"}/\code{"mm"}, only elements corresponding to 
 #    perfect-match (PM) / mismatch (MM) probes are returned (as @vectors).
 #    If \code{"pmmm"}, elements are returned as a matrix where the
 #    first row holds elements corresponding to PM probes and the second
 #    corresponding to MM probes.  Note that in this case, it is assumed 
 #    that there are equal number of PMs and MMs; if not, an error is
 #    generated.  
 #    Moreover, the PMs and MMs may not even be paired, i.e. there is no 
 #    guarantee that the two elements in a column correspond to a
 #    PM-MM pair.}
 #  \item{readIndices}{If @TRUE, cell indices calculated from the row and 
 #    column (X,Y) coordinates are retrieved, otherwise not.}
 #  \item{verbose}{An @integer specifying the verbose level. If 0, the
 #    file is parsed quietly.  The higher the number, the more details.}
 # }
 # 
 # \value{
 #  A named @list where the names correspond to the names
 #  of the units read.  Each element of the list is in turn a
 #  @list structure with three components:
 #  \item{groups}{A @list with one component for each group 
 #   (also called block). The information on each group is a 
505779dd
 #   @list of up to seven components: \code{x}, \code{y}, 
 #   \code{pbase}, \code{tbase}, \code{expos}, \code{indices},
 #   and \code{direction}.
 #   All fields but the latter have the same number of values as 
 #   there are cells in the group.  The latter field has only
 #   one value indicating the direction for the whole group.
 #  }
f1d6fcf0
 #  \item{type}{An @integer specifying the type of the
 #    unit, where 1 is "expression", 2 is "genotyping", 3 is "CustomSeq", 
 #    and 4 "tag".}
 #  \item{direction}{An @integer specifying the direction
 #    of the unit, which defines if the probes are interrogating the sense
 #    or the anti-sense target, where 0 is "no direction", 1 is "sense", and
 #    2 is "anti-sense".}
 # }
 #
 # \author{
 #  James Bullard, \email{[email protected]} and Kasper
 #  Daniel Hansen, \email{[email protected]}.
 #  Modified by Henrik Bengtsson (\url{http://www.braju.com/R/}) to read
 #  any subset of units and/or subset of parameters, to stratify by PM/MM,
 #  and to return cell indices.
 # }
 # 
 # @examples "../incl/readCdfUnits.Rex"
 # 
 # \seealso{
 #   @see "readCdfCellIndices".
 # }
 # 
 # \references{
 #   [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats,
 #       June 14, 2005.
 #       \url{http://www.affymetrix.com/support/developer/}
 # }
 #
 # @keyword "file"
 # @keyword "IO"
 #*/######################################################################### 
ddd48129
 readCdfUnits <- function(filename, units=NULL, readXY=TRUE, readBases=TRUE, readExpos=TRUE, readType=TRUE, readDirection=TRUE, stratifyBy=c("nothing", "pmmm", "pm", "mm"), readIndices=FALSE, verbose=0) {
   # Reads all or a subset of the units in a CDF file via the native
   # routine "R_affx_get_cdf_units" and, unless stratifyBy == "nothing",
   # rearranges the per-cell fields of every group according to the PM/MM
   # status of the cells as reported by readCdfIsPm().

   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # Validate arguments
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # Argument 'filename':
   filename <- file.path(dirname(filename), basename(filename));
   if (!file.exists(filename))
     stop("File not found: ", filename);

   # Argument 'units':
   if (is.null(units)) {
     # NULL is passed on as is, i.e. all units are read.
   } else if (is.numeric(units)) {
     units <- as.integer(units);
     # Test for missing values explicitly; otherwise any(units < 1) below
     # may evaluate to NA and if() fails with an uninformative error.
     if (any(is.na(units)))
       stop("Argument 'units' contains missing values.");
     if (any(units < 1L))
       stop("Argument 'units' contains non-positive indices.");
   } else {
     stop("Argument 'units' must be numeric or NULL: ", class(units)[1]);
   }

   # Argument 'verbose':
   if (length(verbose) != 1)
     stop("Argument 'verbose' must be a single integer.");
   verbose <- as.integer(verbose);
   if (!is.finite(verbose))
     stop("Argument 'verbose' must be an integer: ", verbose);

   # Arguments 'readXY', 'readBases', 'readExpos', 'readType',
   # 'readDirection' and 'readIndices' are handed to the native code
   # as integer flags.
   readXY <- as.integer(as.logical(readXY));
   readBases <- as.integer(as.logical(readBases));
   readExpos <- as.integer(as.logical(readExpos));
   readType <- as.integer(as.logical(readType));
   readDirection <- as.integer(as.logical(readDirection));
   readIndices <- as.integer(as.logical(readIndices));

   # Argument 'stratifyBy':
   stratifyBy <- match.arg(stratifyBy);


   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # Read the CDF file
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   cdf <- .Call("R_affx_get_cdf_units", filename, units,
                 readXY, readBases, readExpos,
                 readType, readDirection,
                 readIndices,
                 verbose, PACKAGE="affxparser");

   if (stratifyBy == "nothing")
     return(cdf);


   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   # Stratify by PM, MM, or PM & MM
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   isPm <- readCdfIsPm(filename, units=units);

   asMatrix <- (stratifyBy == "pmmm");
   dimnames <- list(c("pm", "mm"), NULL);

   # Using .subset2()/.subset() instead of "[["()/"["() to avoid
   # dispatching overhead.
   for (uu in seq_along(cdf)) {
     groups <- .subset2(.subset2(cdf, uu), "groups");
     ngroups <- length(groups);
     if (ngroups == 0)
       next;

     unitIsPm <- .subset2(isPm, uu);

     for (gg in seq_len(ngroups)) {
       group <- .subset2(groups, gg);
       nfields <- length(group);
       # For "pm" and "mm" there is nothing to do for an empty group.  For
       # "pmmm" the PM/MM balance is still verified below, as before.
       if (nfields == 0 && !asMatrix)
         next;

       cellIsPm <- .subset2(unitIsPm, gg);
       ncells <- length(cellIsPm);
       # Note: idx[pm] is considerably faster than which(pm). /HB
       idx <- seq_len(ncells);

       if (asMatrix) {
         pm <- idx[cellIsPm];
         mm <- idx[!cellIsPm];
         npm <- length(pm);
         if (npm != length(mm)) {
           # This is not expected to happen, but just in case.
           stop("Number of PM and MM probes differ in probeset #", uu,
                                      ": ", length(pm), " != ", length(mm));
         }
         # First row holds the PM cell positions, second row the MM ones.
         cells <- matrix(c(pm, mm), nrow=2L, ncol=npm, byrow=TRUE);
         dim <- c(2L, npm);
       } else if (stratifyBy == "pm") {
         cells <- idx[cellIsPm];
       } else {
         cells <- idx[!cellIsPm];
       }

       # Re-order/subset the per-cell fields according to PM/MM.
       fieldNames <- names(group);
       for (kk in seq_len(nfields)) {
         value <- .subset2(group, kk);

         # Only fields with one value per cell can be stratified.  As
         # documented for the return value, the group 'direction' field
         # holds a single value for the whole group; indexing it by cell
         # position would pad it with NAs, so such fields are kept as is.
         if (length(value) != ncells || identical(fieldNames[kk], "direction"))
           next;

         value <- .subset(value, cells);
         if (asMatrix) {
           dim(value) <- dim;
           dimnames(value) <- dimnames;
         }
         group[[kk]] <- value;
       }

       groups[[gg]] <- group;
     } # for (gg ...)

     cdf[[uu]]$groups <- groups;
   } # for (uu ...)

   cdf;
 } # readCdfUnits()
ad4b2701
 
 
 ############################################################################
 # HISTORY:
110eebc7
 # 2010-12-12
 # o ROBUSTNESS: Replaces .Internal(matrix(...)) with matrix().
 #   In the upcoming R 2.13.0 matrix() has less overhead.
505779dd
 # 2006-12-30
 # o Now 'readDirection=TRUE' also return group directions.
ddd48129
 # 2006-03-28
 # o Unit indices are now one-based. /HB
 # o Renamed argument 'readCells' to 'readIndices'. /HB
 # 2006-03-24
 # o Not returning 'pmmm' field anymore.  A bit faster and smaller object.
 # o Speed improvement of the "stratifyBy" code.  Instead of using which()
 #   one can do the same oneself, which is 50% faster.  In addition, I have
 #   replaced "[[" and "[" with .subset2() and .subset().
ad4b2701
 # 2006-02-21
 # o Added argument 'readCells' to speed up the calculation of cell indices
 #   from (x,y), i.e. cell = y * ncol + x.
 # o Replaced argument 'splitPmMm' with 'stratifyBy'.  This will speed up
 #   things down the stream.
 # 2006-01-16
 # o Added argument 'splitPmMm'. /HB
 # 2006-01-09
 # o Note that ../man/readCdfUnits.R is generated from the Rdoc comments
 #   above.  See the R.oo package for details.  Don't remove the *.Rex files!
 # o Created by HB.  The purpose was to make it possible to read subsets
 #   of units and not just all units at once.  /HB
 ############################################################################