Browse code

# o Removed dependency on intToChar() in R.utils. # o BUG FIX/UPDATE: The file format was updated between April 2006 and # November 2007. More specifically, the so called "Value Types" were # changed/corrected. Before values 7:9 were 'DOUBLE', 'STRING', and # 'WSTRING'. Now 7:8 are 'STRING' and 'WSTRING' and there is no longer # a 'DOUBLE'. # This was detected while trying to read a CNCHP file outputted by the # new Affymetrix Genotyping Console 2.0. We can now read these files.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/affxparser@29522 bc3139a8-67e5-0310-9ffc-ced21a209358

Henrik Bengtsson authored on 13/01/2008 23:59:44
Showing 1 changed files

... ...
@@ -96,6 +96,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
96 96
     hasFilter <- TRUE;
97 97
   }
98 98
 
99
+
99 100
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
100 101
   # Open file
101 102
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
... ...
@@ -198,6 +199,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
198 199
     names <- character(dataGroupHeader$nbrOfDataSets);
199 200
     for (kk in seq(along=dss)) { 
200 201
       ds <- .readCcgDataSet(con, fileOffset=offset); 
202
+
201 203
       offset <- ds$nextDataSetStart; 
202 204
       dss[[kk]] <- ds;
203 205
       names[kk] <- ds$name;
... ...
@@ -208,7 +210,6 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
208 210
       header = dataGroupHeader,
209 211
       dataSets = dss
210 212
     );
211
-
212 213
     dataGroups <- c(dataGroups, list(dataGroup));
213 214
   } # while (nextDataGroupStart != 0)
214 215
   names(dataGroups) <- unlist(lapply(dataGroups, FUN=function(dg) {
... ...
@@ -238,8 +239,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
238 239
       return("");
239 240
     bfr <- readBin(con, what=raw(), n=2*nchars);
240 241
     bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
241
-    bfr <- as.integer(bfr);
242
-    bfr <- intToChar(bfr);
242
+    bfr <- rawToChar(bfr, multiple=TRUE);
243 243
     bfr <- paste(bfr, collapse="");
244 244
     bfr;
245 245
   }
... ...
@@ -300,14 +300,13 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
300 300
   # 4 	INT
301 301
   # 5 	UINT
302 302
   # 6 	FLOAT
303
-  # 7 	DOUBLE
304
-  # 8 	STRING
305
-  # 9 	WSTRING
303
+  # 7 	STRING
304
+  # 8 	WSTRING
306 305
   whats <- c("integer", "integer", "integer", "integer", "integer", 
307
-            "integer", "double", "double", "character", "character");
308
-  names(whats) <- c("BYTE", "UBYTE", "SHORT", "USHORT", "INT", "UINT", "FLOAT", "DOUBLE", "STRING", "WSTRING");
309
-  signeds <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE);
310
-  sizes <- c(1, 1, 2, 2, 4, 4, 4, 8, 1, 2);
306
+            "integer", "double", "character", "character");
307
+  names(whats) <- c("BYTE", "UBYTE", "SHORT", "USHORT", "INT", "UINT", "FLOAT", "STRING", "WSTRING");
308
+  signeds <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE);
309
+  sizes <- c(1, 1, 2, 2, 4, 4, 4, 1, 2);
311 310
 
312 311
   # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
313 312
   # Local functions
... ...
@@ -337,8 +336,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
337 336
       return("");
338 337
     bfr <- readBin(con, what=raw(), n=2*nchars);
339 338
     bfr <- bfr[seq(from=2, to=length(bfr), by=2)];
340
-    bfr <- as.integer(bfr);
341
-    bfr <- intToChar(bfr);
339
+    bfr <- rawToChar(bfr, multiple=TRUE);
342 340
     bfr <- paste(bfr, collapse="");
343 341
     bfr;
344 342
   }
... ...
@@ -352,7 +350,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
352 350
 
353 351
   readWVT <- function(con, ...) {
354 352
     name <- readWString(con);
355
-    param <- readRaw(con);
353
+    raw <- readRaw(con);
356 354
     type <- readWString(con);
357 355
 
358 356
     # Update data types
... ...
@@ -364,7 +362,11 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
364 362
     # * text/x-calvin-unsigned-integer-32
365 363
     # * text/x-calvin-float
366 364
     # * text/plain
365
+
367 366
     n <- length(raw);
367
+
368
+#    cat(sprintf("Reading n=%d records of type '%s' named '%s'.\n", n, type, name));
369
+
368 370
     value <- switch(type, 
369 371
       "text/ascii" = {
370 372
         rawToChar(raw);
... ...
@@ -387,23 +389,23 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
387 389
       },
388 390
 
389 391
       "text/x-calvin-integer-16" = {
390
-        readBin(raw, what=integer(0), endian="big", size=2, signed=TRUE, n=n/2);
392
+        readBin(raw, what=integer(0), endian="big", size=2, signed=TRUE, n=n);
391 393
       },
392 394
 
393 395
       "text/x-calvin-unsigned-integer-16" = {
394
-        readBin(raw, what=integer(0), endian="big", size=2, signed=FALSE, n=n/2);
396
+        readBin(raw, what=integer(0), endian="big", size=2, signed=FALSE, n=n);
395 397
       },
396 398
 
397 399
       "text/x-calvin-integer-32" = {
398
-        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n/4);
400
+        readBin(raw, what=integer(0), endian="big", size=4, signed=TRUE, n=n);
399 401
       },
400 402
 
401 403
       "text/x-calvin-unsigned-integer-32" = {
402
-        readBin(raw, what=integer(0), endian="big", size=4, signed=FALSE, n=n/4);
404
+        readBin(raw, what=integer(0), endian="big", size=4, signed=FALSE, n=n);
403 405
       },
404 406
 
405 407
       "text/x-calvin-float" = {
406
-        readBin(raw, what=double(0), endian="big", size=4, n=n/4);
408
+        readBin(raw, what=double(0), endian="big", size=4, n=n);
407 409
       },
408 410
 
409 411
       {
... ...
@@ -459,7 +461,6 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
459 461
     nextDataSetStart=readUInt(con),
460 462
     name=readWString(con)
461 463
   )
462
-
463 464
   # Reading parameters
464 465
   nbrOfParams <- readInt(con);
465 466
   params <- vector("list", nbrOfParams);
... ...
@@ -492,6 +493,7 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
492 493
     attr(what, "size") <- size;
493 494
     colWhats[[cc]] <- what;
494 495
   }
496
+  names(colWhats) <- names;
495 497
   bytesPerRow <- as.integer(bytesPerRow);
496 498
 
497 499
   nbrOfRows <- readUInt(con);
... ...
@@ -511,23 +513,59 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
511 513
     size <- attr(what, "size");
512 514
 
513 515
     if (what == "character") {
514
-      rawCol <- matrix(raw[1:4,], nrow=nbrOfRows, ncol=4);
515
-      nchars <- readInt(con=rawCol, n=nbrOfRows);
516
-      nchars <- nchars[1];
517
-      ccs <- colsOffset + 4 + seq(from=2, to=2*nchars, by=2);
518
-      value <- intToChar(as.integer(raw[ccs,]));
516
+      value <- matrix(raw[1:4,], nrow=nbrOfRows, ncol=4);
517
+      raw <- raw[-c(1:4),,drop=FALSE];
518
+
519
+      # Get the number of characters per string (all equal)
520
+##      nchars <- readInt(con=value, n=nbrOfRows);
521
+##      nchars <- nchars[1];
522
+      nchars <- readInt(con=value, n=1);
523
+      rm(value);
524
+      gc <- gc();
525
+
526
+      ccs <- 1:(size-4);
527
+      value <- raw[ccs,];
528
+      raw <- raw[-ccs,,drop=FALSE];
529
+      gc <- gc();
530
+      value <- rawToChar(value, multiple=TRUE);
519 531
       dim(value) <- c(nchars, nbrOfRows);
520
-      value <- apply(value, MARGIN=2, paste, collapse="");
532
+      gc <- gc();
533
+
534
+      # Build strings using vectorization (not apply()!)
535
+      strs <- NULL;
536
+      for (pp in seq(length=nrow(value))) {
537
+        valuePP <- value[1,,drop=FALSE];
538
+        value <- value[-1,,drop=FALSE];
539
+#        gc <- gc();
540
+        if (pp == 1) {
541
+          strs <- valuePP;
542
+        } else {
543
+          strs <- paste(strs, valuePP, sep="");
544
+        }
545
+        rm(valuePP);
546
+      }
547
+      value <- strs;
548
+      rm(strs);
521 549
     } else {
522
-      ccs <- colsOffset + 1:size;
523
-      rawCol <- matrix(raw[ccs,], nrow=bytesPerRow, ncol=nbrOfRows, byrow=FALSE);
524
-      value <- readBin(con=rawCol, what=what, size=size, signed=signed, endian="big", n=nbrOfRows);
550
+      ccs <- 1:size;
551
+      value <- raw[ccs,,drop=FALSE];
552
+      raw <- raw[-ccs,,drop=FALSE];
553
+      gc <- gc();
554
+      value <- readBin(con=value, what=what, size=size, signed=signed, endian="big", n=nbrOfRows);
525 555
     }
556
+
557
+    # Garbage collect
558
+    gc <- gc();
559
+
526 560
     table[[cc]] <- value;
527 561
     colsOffset <- colsOffset + size;
528 562
   }
529
-  table <- as.data.frame(table);
530
-  colnames(table) <- names;
563
+
564
+  # Turn into a data frame
565
+  attr(table, "row.names") <- .set_row_names(length(table[[1]]));
566
+  attr(table, "names") <- names;
567
+  class(table) <- "data.frame";
568
+
531 569
   dataSet$table <- table;
532 570
 
533 571
   dataSet;
... ...
@@ -536,6 +574,15 @@ readCcg <- function(pathname, verbose=0, .filter=NULL, ...) {
536 574
 
537 575
 ############################################################################
538 576
 # HISTORY:
577
+# 2008-01-13
578
+# o Removed dependency on intToChar() in R.utils.
579
+# o BUG FIX/UPDATE: The file format was updated between April 2006 and
580
+#   November 2007.  More specifically, the so called "Value Types" were
581
+#   changed/corrected.  Before values 7:9 were 'DOUBLE', 'STRING', and
582
+#   'WSTRING'.  Now 7:8 are 'STRING' and 'WSTRING' and there is no longer
583
+#   a 'DOUBLE'.
584
+#   This was detected while trying to read a CNCHP file outputted by the
585
+#   new Affymetrix Genotyping Console 2.0.  We can now read these files.
539 586
 # 2007-08-16
540 587
 # o Now it is only readCcg() and readCcgHeader() that are public.  The
541 588
 #   other readCcgNnn() functions are renamed to .readCcgNnn().