R/updateCel.R
f1e18a54
 #########################################################################/**
 # @RdocFunction updateCel
 #
 # @title "Updates a CEL file"
 #
95092fca
 # @synopsis
 #
f1e18a54
 # \description{
 #   @get "title".
 # }
95092fca
 #
f1e18a54
 # \arguments{
 #   \item{filename}{The filename of the CEL file.}
95092fca
 #   \item{indices}{A @numeric @vector of cell (probe) indices specifying
8b5e38bd
 #     which cells to updated.  If @NULL, all indices are considered.}
f1e18a54
 #   \item{intensities}{A @numeric @vector of intensity values to be stored.
 #     Alternatively, it can also be a named @data.frame or @matrix (or @list)
 #     where the named columns (elements) are the fields to be updated.}
 #   \item{stdvs}{A optional @numeric @vector.}
 #   \item{pixels}{A optional @numeric @vector.}
9d93fc41
 #   \item{writeMap}{An optional write map.}
f1e18a54
 #   \item{...}{Not used.}
 #   \item{verbose}{An @integer specifying how much verbose details are
 #     outputted.}
 # }
95092fca
 #
f1e18a54
 # \value{
 #   Returns (invisibly) the pathname of the file updated.
 # }
 #
 # \details{
 #   Currently only binary (v4) CEL files are supported.
 #   The current version of the method does not make use of the Fusion SDK,
 #   but its own code to navigate and update the CEL file.
 # }
 #
 # @examples "../incl/updateCel.Rex"
 #
76cf4b26
 # @author "HB"
95092fca
 #
f1e18a54
 # @keyword "file"
 # @keyword "IO"
 #*/#########################################################################
63b4c964
 updateCel <- function(filename, indices=NULL, intensities=NULL, stdvs=NULL, pixels=NULL, writeMap=NULL, ..., verbose=0) {
95092fca
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
f1e18a54
   # Validate arguments
95092fca
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
f1e18a54
   # Argument 'filename':
   if (!file.exists(filename)) {
     stop("Cannot update CEL file. File not found: ", filename);
   }
 
   header <- readCelHeader(filename);
   version <- header$version;
   if (!version %in% 4) {
     stop("Updating CEL v", version, " files is not supported: ", filename);
   }
 
   nbrOfCells <- header$total;
 
   # Argument 'indices':
   if (is.null(indices)) {
     nbrOfIndices <- nbrOfCells;
   } else {
     # A CEL file has one-based indices
63b4c964
     r <- range(indices);
     if (r[1] < 1 || r[2] > nbrOfCells) {
       stop("Argument 'indices' is out of range [1,", nbrOfCells, "]: ",
            "[", r[1], ",", r[2], "]");
     }
f1e18a54
     nbrOfIndices <- length(indices);
   }
 
   # Argument 'intensities':
   if (is.matrix(intensities)) {
     intensities <- as.data.frame(intensities);
   }
 
   if (is.list(intensities) || is.data.frame(intensities)) {
     if (is.list(intensities)) {
       fields <- names(intensities);
     } else {
       fields <- colnames(intensities);
     }
 
     if (is.null(stdvs) && ("stdvs" %in% fields)) {
       stdvs <- intensities[["stdvs"]];
     }
 
     if (is.null(pixels) && ("pixels" %in% fields)) {
       pixels <- intensities[["pixels"]];
     }
 
     if ("intensities" %in% fields) {
       intensities <- intensities[["intensities"]];
     }
   }
 
   # Argument 'intensities':
   if (!is.null(intensities)) {
f33805a7
     if (!is.double(intensities))
       intensities <- as.double(intensities);
f1e18a54
     if (length(intensities) != nbrOfIndices) {
       stop("Number of 'intensities' values does not match the number of cell indices: ", length(intensities), " != ", nbrOfIndices);
     }
   }
 
   # Argument 'stdvs':
   if (!is.null(stdvs)) {
f33805a7
     if (!is.double(stdvs))
       stdvs <- as.double(stdvs);
f1e18a54
     if (length(stdvs) != nbrOfIndices) {
       stop("Number of 'stdvs' values does not match the number of cell indices: ", length(stdvs), " != ", nbrOfIndices);
     }
   }
 
   # Argument 'pixels':
   if (!is.null(pixels)) {
f33805a7
     if (!is.integer(pixels))
       pixels <- as.integer(pixels);
f1e18a54
     if (length(pixels) != nbrOfIndices) {
       stop("Number of 'pixels' values does not match the number of cell indices: ", length(pixels), " != ", nbrOfIndices);
     }
   }
 
63b4c964
   # Argument 'writeMap':
   if (!is.null(writeMap)) {
     writeMap <- .assertMap(writeMap, nbrOfCells);
   }
 
f1e18a54
   # Argument 'verbose':
   if (length(verbose) != 1)
2a5f6145
     stop("Argument 'verbose' must be a single integer.");
f1e18a54
   verbose <- as.integer(verbose);
   if (!is.finite(verbose))
2a5f6145
     stop("Argument 'verbose' must be an integer: ", verbose);
f1e18a54
 
   # Nothing to do?
   if (nbrOfIndices == 0) {
     return(invisible(filename));
   }
 
95092fca
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
f1e18a54
   # Reorder data such that it is written in optimal order
95092fca
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
63b4c964
   reorder <- TRUE;
8b5e38bd
   if (is.null(indices)) {
63b4c964
     # Has write map?
     if (is.null(writeMap)) {
       indices <- 1:nbrOfIndices;
       reorder <- FALSE;
     } else {
       indices <- writeMap;
     }
   }
 
   if (reorder) {
f33805a7
     if (verbose >= 2)
       cat("Re-ordering data for optimal write order...");
8b5e38bd
     o <- order(indices);
     indices <- indices[o];
     if (!is.null(intensities))
       intensities <- intensities[o];
     if (!is.null(stdvs))
       stdvs <- stdvs[o];
     if (!is.null(pixels))
       pixels <- pixels[o];
95092fca
     o <- NULL; # Not needed anymore
f33805a7
     if (verbose >= 2)
       cat("done.\n");
8b5e38bd
   }
f1e18a54
 
95092fca
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
f1e18a54
   # Write data to file
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   version <- header$version;
   if (version == 4) {
     # Open CEL file
     con <- file(filename, open="r+b");
     on.exit(close(con));
 
     # Skip CEL header
f33805a7
     if (verbose >= 2)
       cat("Skipping to beginging of data section...");
5f7aff5e
     h <- .readCelHeaderV4(con);
f1e18a54
 
95092fca
     # "Cell entries - this consists of an intensity value, standard
f1e18a54
     #  deviation value and pixel count for each cell in the array.
95092fca
     #  The values are stored by row then column starting with the X=0,
     #  Y=0 cell. As an example, the first five entries are for cells
f1e18a54
     #  defined by XY coordinates: (0,0), (1,0), (2,0), (3,0), (4,0).
     #  Type: (float, float, short) = 4 + 4 + 2 = 10 bytes / cell
     #  cellData <- c(readFloat(con), readFloat(con), readShort(con));
8b5e38bd
     sizeOfCell <- 10;
 
f1e18a54
     # Current file position
c448bd63
     dataOffset <- seek(con, origin="start", rw="read");
5f7aff5e
 
f33805a7
     if (verbose >= 2)
       cat("done.\n");
f1e18a54
 
8b5e38bd
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
f33805a7
     # Update in chunks
8b5e38bd
     # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
a92acde3
     CHUNK.SIZE <- 2^19; # = 524288 indices == 5.2Mb
c448bd63
     CHUNK.SIZE <- 2^20; # = 1048576 indices == 10.5Mb
8b5e38bd
 
f33805a7
     # Work with zero-based indices
     indices <- indices - 1;
8b5e38bd
 
f33805a7
     count <- 1;
     offset <- dataOffset;
ede2c2f5
     nbrOfChunks <- ceiling(length(indices) / CHUNK.SIZE);
f33805a7
     while (length(indices) > 0) {
       if (verbose >= 1) {
ede2c2f5
         cat(sprintf("Number of indices left: %d\n", length(indices)));
         cat(sprintf("Updating chunk #%d of %d...\n", count, nbrOfChunks));
f33805a7
       }
f1e18a54
 
f33805a7
       # Recall: All indices are ordered!
f1e18a54
 
f33805a7
       # Shift offset to the first index.
       firstIndex <- indices[1];
       offset <- offset + sizeOfCell*firstIndex;
f1e18a54
 
ddf7b0da
       # Shift indices such that first index is zero.
f33805a7
       indices <- indices - firstIndex;
8b5e38bd
 
f33805a7
       # Get largest index
       maxIndex <- indices[length(indices)];
a92acde3
 #      verbose && cat(verbose, "Largest index: ", maxIndex, "\n");
8b5e38bd
 
f33805a7
       # Identify the indices to update such no more than CHUNK.SIZE cells
       # are read/updated.
       n <- which.max(indices >= CHUNK.SIZE);
       if (n == 1)
a92acde3
         n <- length(indices);
f33805a7
 
       subset <- 1:n;
 
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       # Read the data section of the CEL file
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       if (verbose >= 1)
         cat("Reading chunk data section...");
 
c448bd63
       seek(con, origin="start", where=offset, rw="read");
a92acde3
       rawAll <- readBin(con=con, what="raw", n=sizeOfCell*(indices[n]+1));
f33805a7
       if (verbose >= 1)
         cat("done.\n");
       # Common to all fields
       raw <- NULL;
95092fca
 
f33805a7
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       # Update 'intensities'
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       if (!is.null(intensities)) {
         if (verbose >= 1)
           cat("Updating 'intensities'...");
         # Write floats (size=4) to raw vector
         raw <- raw(length=4*n);
         raw <- writeBin(con=raw, intensities[subset], size=4, endian="little");
         intensities <- intensities[-subset]; # Not needed anymore
 
         # Updated 'rawAll' accordingly
         idx <- rep(sizeOfCell*indices[subset], each=4) + 1:4;
         rawAll[idx] <- raw;
95092fca
         idx <- NULL; # Not needed anymore
f33805a7
         if (verbose >= 1)
           cat("done.\n");
       }
95092fca
 
f33805a7
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       # Update 'stdvs'
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       if (!is.null(stdvs)) {
         if (verbose >= 1)
           cat("Updating 'stdvs'...");
         # Write floats (size=4) to raw vector
         if (length(raw) != 4*n)
           raw <- raw(length=4*n);
         raw <- writeBin(con=raw, stdvs[subset], size=4, endian="little");
         stdvs <- stdvs[-subset]; # Not needed anymore
95092fca
 
f33805a7
         # Updated 'rawAll' accordingly
         idx <- rep(sizeOfCell*indices[subset], each=4) + 5:8;
         rawAll[idx] <- raw;
95092fca
         idx <- NULL; # Not needed anymore
f33805a7
         if (verbose >= 1)
           cat("done.\n");
       }
95092fca
 
f33805a7
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       # Update 'pixels'
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       if (!is.null(pixels)) {
95092fca
         raw <- NULL; # Not needed anymore
f33805a7
         if (verbose >= 1)
           cat("Updating 'pixels'...");
         # Write short integers (size=2) to raw vector
         raw <- raw(length=2*n);
         raw <- writeBin(con=raw, pixels[subset], size=2, endian="little");
         pixels <- pixels[-subset]; # Not needed anymore
95092fca
 
f33805a7
         # Updated 'rawAll' accordingly
         idx <- rep(sizeOfCell*indices[subset], each=2) + 9:10;
         rawAll[idx] <- raw;
95092fca
         idx <- NULL; # Not needed anymore
f33805a7
         if (verbose >= 1)
           cat("done.\n");
       }
95092fca
       raw <- NULL; # Not needed anymore
a92acde3
 
f33805a7
       # Remove updated indices
       indices <- indices[-subset];
95092fca
       subset <- NULL; # Not needed anymore
f33805a7
 
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       # Write raw data back to file
       # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
       if (verbose >= 1)
         cat("Writing chunk to data section...");
c448bd63
       seek(con, origin="start", where=offset, rw="write");
f33805a7
       writeBin(con=con, rawAll);
       if (verbose >= 1)
         cat("done.\n");
 
95092fca
       rawAll <- NULL; # Not needed anymore
f33805a7
 
       if (verbose >= 1)
ede2c2f5
         cat(sprintf("Updating chunk #%d of %d...done\n", count, nbrOfChunks));
f33805a7
       count <- count + 1;
     } # while (...)
f1e18a54
   } # if (version ...)
 
   invisible(filename);
 }
 
 
 ############################################################################
 # HISTORY:
63b4c964
 # 2007-01-04
 # o Added argument 'writeMap'.
c448bd63
 # 2006-08-19
 # o BUG FIX: Wow wow wow.  This one was tricky to find.  If not specifying
 #   the 'rw' argument in seek() it defaults to "", which is not "read" as
 #   I naively though (because I did not read the inner details of ?seek),
 #   but the latest call to seek.  In other words, since I at the end of
 #   every "chunk" loop call seek(..., rw="write") the seek(..., [rw=""])
 #   was equal to a seek(..., rw="write"), but I wanted seek(..., rw="read")!
 #   That made updateCel() do funny things and write to the wrongs parts
 #   of the file etc.
a92acde3
 # 2006-08-18
95092fca
 # o BUG FIX: The new implementation where data was written to raw vectors
a92acde3
 #   was incorrect.  A lot of extra zeros was written.  The "eastern egg" in
 #   the updated example contains an image from http://tpo.berkeley.edu/.
 # 2006-08-14
95092fca
 # o BUG FIX: updateCel() would in some cases give "Error: subscript out of
a92acde3
 #   bounds" when writing the last chunk.
f33805a7
 # 2006-07-22
95092fca
 # o Update updateCel() to update data in chunks, because updating the
f33805a7
 #   complete data section is expensive.  For example, a 500K chip has
 #   6553600 cells each of size 10 bytes, i.e. >65Mb or raw memory.  With
 #   copying etc it costs >100-200Mb of memory to update a CEL file if only
 #   the first *and* the last cell is updated.  Now it should only be of
 #   the order of 10-20Mb per chunk.
 # o Added verbose output to updateCel().
 # o Now updateCel() deallocates objects as soon as possible in order to
 #   free up as much memory as possible. Had memory problems with the 500K's.
8b5e38bd
 # 2006-07-21
 # o updateCel() was really slow when updating a large number of cells.
 #   Now the idea is to write to raw vectors stored in memory.  By reading
 #   the chunk of the CEL data section that is going to be updated as a raw
 #   data vector and then updating this in memory first, and the re-write
 #   that chuck of raw data to file, things are much faster.
 # o BUG FIX: updateCel(..., indices=NULL) would generate an error, because
 #   we tried to reorder by order(indices).
f1e18a54
 # 2006-06-19
 # o Replace 'data' argument with arguments 'intensities', 'stdvs', and
 #   'pixels'. /HB
 # 2006-06-18
95092fca
 # o First version can update CEL v4 (binary) cell entries.  Note that this
 #   code does not make use of the Fusion SDK library.  This may updated in
f1e18a54
 #   the future, but for now we just want something that works.
 # o Created. /HB
95092fca
 ############################################################################