#########################################################################/**
# @RdocFunction readCcg
#
# @title "Reads an Affymetrix Command Console Generic (CCG) Data file"
#
# @synopsis
#
# \description{
#  @get "title".  The CCG data file format is also known as the
#  Calvin file format.
# }
#
# \arguments{
#   \item{pathname}{The pathname of the CCG file.}
#   \item{verbose}{An @integer specifying the verbose level. If 0, the
#     file is parsed quietly.  The higher the number, the more details.}
#   \item{.filter}{A @list.}
#   \item{...}{Not used.}
# }
#
# \value{
#   A named @list structure consisting of ...
# }
#
# \details{
#  Note that the current implementation of this method does not utilize the
#  Affymetrix Fusion SDK library.  Instead, it is implemented in R from the
#  file format definition [1].
# }
#
# \section{About the CCG file format}{
#  A CCG file consists of a "file header", a "generic data header",
#  and a "data" section, as outlined here:
#  \itemize{
#   \item File Header
#   \item Generic Data Header (for the file)
#    \enumerate{
#     \item Generic Data Header (for the file's 1st parent)
#      \enumerate{
#       \item Generic Data Header (for the file's 1st parent's 1st parent)
#       \item Generic Data Header (for the file's 1st parent's 2nd parent)
#       \item ...
#       \item Generic Data Header (for the file's 1st parent's Mth parent)
#      }
#     \item Generic Data Header (for the file's 2nd parent)
#     \item ...
#     \item Generic Data Header (for the file's Nth parent)
#    }
#   \item Data
#    \enumerate{
#     \item Data Group #1
#      \enumerate{
#       \item Data Set #1
#        \itemize{
#         \item Parameters
#         \item Column definitions
#         \item Matrix of data
#        }
#       \item Data Set #2
#       \item ...
#       \item Data Set #L
#      }
#     \item Data Group #2
#     \item ...
#     \item Data Group #K
#    }
#  }
# }
#
# @author "HB"
#
# \seealso{
#   @see "readCcgHeader".
#   @see "readCdfUnits".
# }
#
# \references{
#  [1] Affymetrix Inc., Affymetrix GCOS 1.x compatible file formats,
#      April 2006.
#      \url{http://www.affymetrix.com/support/developer/}\cr
# }
#
# @keyword "file"
# @keyword "IO"
#*/#########################################################################
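# An informal usage sketch (not run).  The pathname below is only a
# placeholder (the Calvin test file mentioned in the HISTORY notes at the
# end of this file); the element names follow from how the result list is
# assembled in readCcg() below:
#
#   ccg <- readCcg("Test3-1-121502.calvin.CEL")
#   names(ccg)    # "fileHeader", "genericDataHeader", "dataGroups"
#   str(ccg$dataGroups, max.level=2)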
readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument '.filter':
  hasFilter <- FALSE;
  if (!is.null(.filter)) {
    if (!is.list(.filter)) {
      stop("Argument '.filter' must be a list: ", mode(.filter));
    }
    hasFilter <- TRUE;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Open file
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  con <- file(pathname, open="rb");
  on.exit(close(con));

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Allocate return structure
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ccg <- list();

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read file header
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  fhdr <- .readCcgFileHeader(con);
  if (hasFilter) {
    if (!identical(.filter$header, FALSE))
      ccg$fileHeader <- fhdr;
  } else {
    ccg$fileHeader <- fhdr;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read the data header
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ccg$genericDataHeader <- .readCcgDataHeader(con, .filter=.filter$dataHeader);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read the data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  dataGroups <- .readCcgDataGroups(con, .filter=.filter$data, .fileHeader=fhdr);
  if (hasFilter) {
    if (!identical(.filter$dataGroups, FALSE))
      ccg$dataGroups <- dataGroups;
  } else {
    ccg$dataGroups <- dataGroups;
  }

  ccg;
} # readCcg()
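# A sketch of how the '.filter' argument is interpreted by readCcg() above
# (not run; the pathname is again a placeholder).  Setting an element to
# FALSE drops the corresponding part from the returned list; note that the
# file is still parsed in full:
#
#   hdrOnly <- readCcg("Test3-1-121502.calvin.CEL",
#                      .filter=list(header=FALSE, dataGroups=FALSE))
#   names(hdrOnly)    # "genericDataHeader"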
.readCcgDataGroups <- function(pathname, .filter=NULL, .fileHeader=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'pathname':
  if (inherits(pathname, "connection")) {
    con <- pathname;
  } else {
    if (!file.exists(pathname))
      stop("File not found: ", pathname);
    con <- file(pathname, open="rb");
    on.exit(close(con));
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read file header?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.null(.fileHeader)) {
    .fileHeader <- .readCcgFileHeader(con);
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read data groups
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  currFilter <- .filter;
  nextDataGroupStart <- .fileHeader$dataGroupStart;
  dataGroups <- list();
  for (gg in seq_len(.fileHeader$nbrOfDataGroups)) {
    dataGroupHeader <- .readCcgDataGroupHeader(con, fileOffset=nextDataGroupStart);

    # Next data group
    nextDataGroupStart <- dataGroupHeader$nextGroupStart;

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Apply filter
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # if (!is.null(.filter)) {
    #   currFilter <- NULL;
    #   if (is.null(names(.filter))) {
    #     currFilter <- .filter[[gg]];
    #   } else {
    #     pos <- match(dataGroupHeader$name, names(.filter));
    #     if (length(pos) > 0)
    #       currFilter <- .filter[[pos]];
    #   }
    # }
    # str(currFilter);

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Read data sets
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    offset <- dataGroupHeader$dataSetStart;
    dss <- vector("list", dataGroupHeader$nbrOfDataSets);
    names <- character(dataGroupHeader$nbrOfDataSets);
    for (kk in seq_along(dss)) {
      ds <- .readCcgDataSet(con, fileOffset=offset);
      offset <- ds$nextDataSetStart;
      dss[[kk]] <- ds;
      names[kk] <- ds$name;
    } # for (kk ...)
    names(dss) <- names;

    dataGroup <- list(
      header = dataGroupHeader,
      dataSets = dss
    );

    dataGroups <- c(dataGroups, list(dataGroup));
  } # for (gg ...)

  names(dataGroups) <- unlist(lapply(dataGroups, FUN=function(dg) {
    dg$header$name
  }), use.names=FALSE);

  dataGroups;
} # .readCcgDataGroups()
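# A minimal sketch (hypothetical helper, not part of the package API)
# showing how the data group headers are chained together: each header
# records the file position of the next group, so all group names can be
# listed without reading any data sets.  It assumes .readCcgFileHeader(),
# defined alongside readCcgHeader(), is available.
.listCcgDataGroupNames <- function(pathname, ...) {
  con <- file(pathname, open="rb");
  on.exit(close(con));
  fhdr <- .readCcgFileHeader(con);
  names <- character(fhdr$nbrOfDataGroups);
  offset <- fhdr$dataGroupStart;
  for (gg in seq_len(fhdr$nbrOfDataGroups)) {
    hdr <- .readCcgDataGroupHeader(con, fileOffset=offset);
    names[gg] <- hdr$name;
    # File position of the next data group (0 for the last one)
    offset <- hdr$nextGroupStart;
  } # for (gg ...)
  names;
} # .listCcgDataGroupNames()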
.readCcgDataGroupHeader <- function(con, fileOffset=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  rawToString <- function(raw, ...) {
    # This approach drops all '\0', in order to avoid warnings
    # in rawToChar().  Note, it does not truncate the string after
    # the first '\0'.  However, such strings should never occur in
    # the first place.
    raw <- raw[raw != as.raw(0)];
    rawToChar(raw);
  }

  readInt <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readUInt <- function(con, n=1, ...) {
    # NOTE: Ideally we would use signed=FALSE here, but there is no
    # integer data type in R that can hold 4-byte unsigned integers.
    # Because of this limitation, readBin() will give a warning that
    # signed=FALSE only works for size=1 or 2.
    # WORKAROUND: Use signed=TRUE and assume there are no values
    # greater than .Machine$integer.max == 2^31-1. /HB 2015-04-15
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readWString <- function(con, ...) {
    nchars <- readInt(con);
    if (nchars == 0)
      return("");
    bfr <- readBin(con, what=raw(), n=2*nchars);
    bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
    rawToString(bfr);
  }

  readRaw <- function(con, ...) {
    n <- readInt(con);
    if (n == 0)
      return(raw(0));
    readBin(con, what=raw(0), n=n);
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Read the data group header
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (!is.null(fileOffset)) {
    seek(con=con, where=fileOffset, origin="start", rw="read");
  }

  # Data Group
  # This section describes the data group.  A data group is a group of
  # data sets.  The file supports one or more data groups in a file.
  #
  #  Item  Description                                              Type
  #   1    File position of the next data group.  When this is the
  #        last data group in the file, the value should be 0.      UINT
  #   2    File position of the first data set within the data
  #        group.                                                   UINT
  #   3    The number of data sets within the data group.           INT
  #   4    The data group name.                                     WSTRING
  nextGroupStart <- readUInt(con);
  dataSetStart <- readUInt(con);
  nbrOfDataSets <- readInt(con);
  name <- readWString(con);

  dataGroupHeader <- list(
    nextGroupStart = nextGroupStart,
    dataSetStart = dataSetStart,
    nbrOfDataSets = nbrOfDataSets,
    name = name
  );

  dataGroupHeader;
} # .readCcgDataGroupHeader()
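# The readUInt() helpers above and below read 4-byte unsigned fields with
# signed=TRUE, because R has no 32-bit unsigned integer type; a value at or
# beyond 2^31 would therefore come back negative.  A minimal sketch
# (hypothetical helper, not used by the parser) of how such a value could
# be mapped back to its unsigned interpretation, represented as a double:
.uint32AsDouble <- function(x) {
  x <- as.double(x);
  # A negative value here is a 32-bit unsigned integer that wrapped around
  x + ifelse(x < 0, 2^32, 0);
} # .uint32AsDouble()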
.readCcgDataSet <- function(con, fileOffset=NULL, ...) {
  # Value Types
  # The following table defines the numeric values for the value types.
  # The value type is used to represent the type of value stored in
  # the file.
  #
  #  Value  Type
  #    0    BYTE
  #    1    UBYTE
  #    2    SHORT
  #    3    USHORT
  #    4    INT
  #    5    UINT
  #    6    FLOAT
  #    7    STRING
  #    8    WSTRING
  whats <- c("integer", "integer", "integer", "integer", "integer",
             "integer", "double", "character", "character");
  names(whats) <- c("BYTE", "UBYTE", "SHORT", "USHORT", "INT", "UINT",
                    "FLOAT", "STRING", "WSTRING");
  signeds <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE);
  sizes <- c(1, 1, 2, 2, 4, 4, 4, 1, 2);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  rawToString <- function(raw, ...) {
    # This approach drops all '\0', in order to avoid warnings
    # in rawToChar().  Note, it does not truncate the string after
    # the first '\0'.  However, such strings should never occur in
    # the first place.
    raw <- raw[raw != as.raw(0)];
    rawToChar(raw);
  }

  readByte <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=1, signed=TRUE, endian="big", n=n);
  }

  readInt <- function(con, n=1, ...) {
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readUInt <- function(con, n=1, ...) {
    # NOTE: Ideally we would use signed=FALSE here, but there is no
    # integer data type in R that can hold 4-byte unsigned integers.
    # Because of this limitation, readBin() will give a warning that
    # signed=FALSE only works for size=1 or 2.
    # WORKAROUND: Use signed=TRUE and assume there are no values
    # greater than .Machine$integer.max == 2^31-1. /HB 2015-04-15
    readBin(con, what=integer(), size=4, signed=TRUE, endian="big", n=n);
  }

  readString <- function(con, ...) {
    nchars <- readInt(con);
    if (nchars == 0)
      return("");
    readChar(con, nchars=nchars);
  }

  readWString <- function(con, ...) {
    nchars <- readInt(con);
    if (nchars == 0)
      return("");
    bfr <- readBin(con, what=raw(), n=2*nchars);
    bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
    rawToString(bfr);
  }

  readRaw <- function(con, ...) {
    n <- readInt(con);
    if (n == 0)
      return(raw(0));
    readBin(con, what=raw(0), n=n);
  }

  readWVT <- function(con, ...) {
    name <- readWString(con);
    raw <- readRaw(con);
    type <- readWString(con);

    # Update data types
    #  * text/x-calvin-integer-8
    #  * text/x-calvin-unsigned-integer-8
    #  * text/x-calvin-integer-16
    #  * text/x-calvin-unsigned-integer-16
    #  * text/x-calvin-integer-32
    #  * text/x-calvin-unsigned-integer-32
    #  * text/x-calvin-float
    #  * text/plain
    n <- length(raw);
    # cat(sprintf("Reading n=%d records of type '%s' named '%s'.\n", n, type, name));
    value <- switch(type,
      "text/ascii" = {
        rawToString(raw);
      },

      "text/plain" = {
        # Unicode/UTF-16?!?
        raw <- matrix(raw, ncol=2, byrow=TRUE);
        raw <- raw[,2];
        rawToString(raw);
      },

      "text/x-calvin-integer-8" = {
        readBin(raw, what=integer(0), endian="big", size=1, signed=TRUE, n=n);
      },

      "text/x-calvin-unsigned-integer-8" = {
        readBin(raw, what=integer(0), endian="big", size=1, signed=FALSE, n=n);
      },

      "text/x-calvin-integer-16" = {
        readBin(raw, what=integer(0), endian="big", size=2, signed=TRUE, n=n);
      },

      "text/x-calvin-unsigned-integer-16" = {
        readBin(raw, what=integer(0), endian="big", size=2, signed=FALSE, n=n);
      },

      "text/x-calvin-integer-32" = {
        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n);
      },

      "text/x-calvin-unsigned-integer-32" = {
        # NOTE: Ideally we would use signed=FALSE here, but there is no
        # integer data type in R that can hold 4-byte unsigned integers.
        # Because of this limitation, readBin() will give a warning that
        # signed=FALSE only works for size=1 or 2.
        # WORKAROUND: Use signed=TRUE and assume there are no values
        # greater than .Machine$integer.max == 2^31-1. /HB 2015-04-15
        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n);
      },

      "text/x-calvin-float" = {
        readBin(raw, what=double(0), endian="big", size=4, n=n);
      },

      {
        raw;
      }
    ) # switch()

    list(name=name, value=value, raw=raw, type=type);
  } # readWVT()

  readWBI <- function(con, ...) {
    list(name=readWString(con), type=readByte(con), size=readInt(con));
  }

  if (!is.null(fileOffset)) {
    seek(con=con, where=fileOffset, origin="start", rw="read");
  }

  # Data Set
  # This section describes the data for a single data set item
  # (probe set, sequence, allele, etc.).  The file supports one
  # or more data sets within a data group.
  #
  #  Item  Description                                              Type
  #   1    The file position of the first data element in the data
  #        set.  This is the first byte after the data set header.  UINT
  #   2    The file position of the next data set within the data
  #        group.  When this is the last data set in the data group,
  #        the value shall be 1 byte past the end of the data set.
  #        This way the size of the data set may be determined.     UINT
  #   3    The data set name.                                       WSTRING
  #   4    The number of name/value/type parameters.                INT
  #   5    Array of name/value/type parameters.
  #                                          (WSTRING / VALUE / TYPE) [ ]
  #   6    Number of columns in the data set.
  #        Example: For expression arrays, columns may include
  #        signal, p-value, detection call; for genotyping arrays,
  #        columns may include allele call and confidence value.
  #        For universal arrays, columns may include probe set
  #        intensities and background.                              UINT
  #   7    An array of column names, column value types and column
  #        type sizes (one per column).  The value type shall be
  #        represented by the value from the value type table.  The
  #        size shall be the size of the type in bytes.  For strings,
  #        this value shall be the size of the string in bytes plus
  #        4 bytes for the string length written before the string
  #        in the file.                        (WSTRING / BYTE / INT) [ ]
  #   8    The number of rows in the data set.                      UINT
  #   9    The data set table, consisting of rows of columns (data
  #        values).  The specific type and size of each column is
  #        described by the data and size types above.              ROW [ ]
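  # The data set is read in the order laid out above: first the header
  # fields (items 1-3), then the name/value/type parameters (items 4-5),
  # then the column definitions (items 6-7), and finally all rows
  # (items 8-9) as one block of raw bytes that is split up column by
  # column further below.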
  dataSet <- list(
    elementsStart = readUInt(con),
    nextDataSetStart = readUInt(con),
    name = readWString(con)
  );

  # Reading parameters
  nbrOfParams <- readInt(con);
  params <- vector("list", nbrOfParams);
  names <- character(nbrOfParams);
  for (kk in seq_len(nbrOfParams)) {
    wvt <- readWVT(con);
    names[kk] <- wvt$name;
    value <- wvt$value;
    attr(value, "mimeType") <- wvt$type;
    params[[kk]] <- value;
  }
  names(params) <- names;
  dataSet$parameters <- params;

  # Reading columns
  nbrOfColumns <- readUInt(con);
  columns <- vector("list", nbrOfColumns);
  names <- character(nbrOfColumns);
  colWhats <- vector("list", nbrOfColumns);
  bytesPerRow <- 0;
  for (cc in seq_len(nbrOfColumns)) {
    wbi <- readWBI(con);
    names[cc] <- wbi$name;
    what <- whats[wbi$type+1];
    signed <- signeds[wbi$type+1];
    size <- wbi$size;
    bytesPerRow <- bytesPerRow + size;
    attr(what, "name") <- names(whats)[wbi$type+1];
    attr(what, "signed") <- signed;
    attr(what, "size") <- size;
    colWhats[[cc]] <- what;
  }
  names(colWhats) <- names;
  bytesPerRow <- as.integer(bytesPerRow);

  nbrOfRows <- readUInt(con);
  totalNbrOfBytes <- nbrOfRows * bytesPerRow;

  # Skip to the first element
  seek(con, where=dataSet$elementsStart, origin="start", rw="read");

  # Read all data row by row
  raw <- readBin(con, what=raw(), n=totalNbrOfBytes);
  dim(raw) <- c(bytesPerRow, nbrOfRows);

  table <- vector("list", nbrOfColumns);
  colsOffset <- 0;
  for (cc in seq_len(nbrOfColumns)) {
    what <- colWhats[[cc]];
    signed <- attr(what, "signed");
    size <- attr(what, "size");

    if (what == "character") {
      value <- matrix(raw[1:4,], nrow=nbrOfRows, ncol=4);
      raw <- raw[-c(1:4),,drop=FALSE];

      # Get the number of characters per string (all equal)
      ## nchars <- readInt(con=value, n=nbrOfRows);
      ## nchars <- nchars[1];
      nchars <- readInt(con=value, n=1);
      value <- NULL; # Not needed anymore

      ccs <- 1:(size-4);
      value <- raw[ccs,];
      raw <- raw[-ccs,,drop=FALSE];
      value <- rawToChar(value, multiple=TRUE);
      dim(value) <- c(nchars, nbrOfRows);

      # Build strings using vectorization (not apply()!)
      strs <- NULL;
      for (pp in seq_len(nrow(value))) {
        valuePP <- value[1,,drop=FALSE];
        value <- value[-1,,drop=FALSE];
        if (pp == 1) {
          strs <- valuePP;
        } else {
          strs <- paste(strs, valuePP, sep="");
        }
        valuePP <- NULL; # Not needed anymore
      }
      value <- strs;
      strs <- NULL; # Not needed anymore
    } else {
      ccs <- 1:size;
      value <- raw[ccs,,drop=FALSE];
      raw <- raw[-ccs,,drop=FALSE];
      value <- readBin(con=value, what=what, size=size, signed=signed,
                       endian="big", n=nbrOfRows);
    }

    table[[cc]] <- value;
    colsOffset <- colsOffset + size;
  } # for (cc ...)

  # Turn into a data frame
  attr(table, "row.names") <- .set_row_names(length(table[[1]]));
  attr(table, "names") <- names;
  class(table) <- "data.frame";

  dataSet$table <- table;

  dataSet;
} # .readCcgDataSet()
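# The readWString()/readWVT() code above decodes wide strings by keeping
# every second byte, which only works for ASCII content (see also the
# HISTORY entry of 2007-08-16 below).  A minimal alternative sketch
# (hypothetical helper, not used by the parser), under the assumption that
# the wide strings are big-endian UTF-16 and that the platform's iconv()
# knows the "UTF-16BE" encoding:
.utf16beToChar <- function(raw) {
  iconv(list(raw), from="UTF-16BE", to="UTF-8");
} # .utf16beToChar()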
############################################################################
# HISTORY:
# 2012-05-18
# o Now using stop() instead of throw().
# 2011-11-01
# o CLEANUP: Changed signed=FALSE to signed=TRUE for readBin() calls
#   reading 4-byte integers in internal .readCcgDataGroupHeader() and
#   .readCcgDataSet().
# 2009-02-10
# o Added internal rawToString() replacing rawToChar() to avoid warnings
#   on "truncating string with embedded nul".
# 2008-08-23
# o SPEED UP: Removed all gc() calls.
# 2008-01-13
# o Removed dependency on intToChar() in R.utils.
# o BUG FIX/UPDATE: The file format was updated between April 2006 and
#   November 2007.  More specifically, the so-called "Value Types" were
#   changed/corrected.  Before, values 7:9 were 'DOUBLE', 'STRING', and
#   'WSTRING'.  Now 7:8 are 'STRING' and 'WSTRING', and there is no longer
#   a 'DOUBLE'.  This was detected while trying to read a CNCHP file output
#   by the new Affymetrix Genotyping Console 2.0.  We can now read these
#   files.
# 2007-08-16
# o Now it is only readCcg() and readCcgHeader() that are public.  The
#   other readCcgNnn() functions were renamed to .readCcgNnn().
# o Now the read data is converted according to the MIME type.  See the
#   internal readWVT() function.  The code is still ad hoc, so it is not
#   generic.  For instance, it basically assumes that Unicode strings only
#   contain ASCII/ASCII-8 characters.
# 2006-11-06
# o Tested on Test3-1-121502.calvin.CEL and Test3-1-121502.calvin.CDF.
# o Created.
############################################################################