R/sampleInfoFromGEO.R
2ca938f1
 sampleInfoFromGEO <- function(file, remove.constant.columns=TRUE)
25be9bfe
 # Parse sample information from a GEO series matrix file into character matrices
 # Gordon Smyth
 # Created 7 Nov 2021.
 {
   con <- file(file, "r")  
   on.exit(close(con))
   i <- 0
   repeat {
     i <- i+1
     txt <- readLines(con,n=1)
     if(!length(txt)) stop("Sample information not found in file. Input should be a GEO series matrix file.")
     if(identical(substring(txt,1,8),"!Sample_")) break
   }
   Lines <- list()
   i <- 0
   repeat {
     i <- i+1
     Lines[[i]] <- txt
     txt <- readLines(con,n=1)
     if(!length(txt)) break
     if(!identical(substring(txt,1,8),"!Sample_")) break
   }
   Lines <- lapply(Lines,function(x) strsplit(x,split="\t")[[1]])
   NSamples <- lengths(Lines)
   Short <- which(NSamples < max(NSamples))
   if(length(Short)) for (i in Short) Lines[[i]] <- NULL
   NSamples <- NSamples[1] - 1L
   Names <- vapply(Lines, FUN=function(x) substring(x[1],9,nchar(x[1])), FUN.VALUE="")
   Lines <- vapply(Lines, FUN=function(x) gsub("\"","",x[-1]), FUN.VALUE=character(NSamples))
   colnames(Lines) <- Names
   rownames(Lines) <- 1:NSamples
 
 # Parse characteristics columns
   i <- grep("characteristics_ch1",colnames(Lines))
   if(length(i)) {
     CharacteristicsCh1 <- .parseGEOSampleCharacteristics(Lines[,i,drop=FALSE])
     Lines <- Lines[,-i]
   } else {
     CharacteristicsCh1 <- NULL
   }
   i <- grep("characteristics_ch2",colnames(Lines))
   if(length(i)) {
     CharacteristicsCh2 <- .parseGEOSampleCharacteristics(Lines[,i,drop=FALSE])
     Lines <- Lines[,-i]
   } else {
     CharacteristicsCh2 <- NULL
   }
 
   if(remove.constant.columns && length(Lines) > 0L) {
     jl <- .columnsAllEqual(Lines)
     if(any(jl)) Lines <- Lines[,!jl,drop=FALSE]
   }
   if(remove.constant.columns && length(CharacteristicsCh1) > 0L) {
     jl <- .columnsAllEqual(CharacteristicsCh1)
     if(any(jl)) CharacteristicsCh1 <- CharacteristicsCh1[,!jl,drop=FALSE]
   }
   if(remove.constant.columns && length(CharacteristicsCh2) > 0L) {
     jl <- .columnsAllEqual(CharacteristicsCh2)
     if(any(jl)) CharacteristicsCh2 <- CharacteristicsCh2[,!jl,drop=FALSE]
   }
 
   list(SampleInfo=Lines,CharacteristicsCh1=CharacteristicsCh1,CharacteristicsCh2=CharacteristicsCh2)
 }
 
 .parseGEOSampleCharacteristics <- function(x)
 # Parse the sample characteristics rows of a GEO series matrix file
 # Gordon Smyth
 # Created 7 Nov 2021.
 {
   nsamples <- nrow(x)
   characteristic.value <- strsplit2(x,split=": ")
   characteristic <- characteristic.value[,1]
   value <- characteristic.value[,2]
   Characteristics <- unique(characteristic)
   Characteristics <- Characteristics[Characteristics != ""]
   ncharacteristics <- length(Characteristics)
   targets <- matrix(NA_character_,nsamples,ncharacteristics)
   rownames(targets) <- 1:nsamples
   colnames(targets) <- Characteristics
   sample <- rep(1:nsamples,ncol(x))
   for (i in 1:length(x)) {
     if(value[i] != "") targets[sample[i],characteristic[i]] <- value[i]
   }
   targets
 }
 
 .columnsAllEqual <- function(x)
 # Identify which columns of a matrix have all elements the same
 # Gordon Smyth
 # Created 7 Nov 2021.
 {
   x <- as.matrix(x)
   n <- nrow(x)
   if(n <= 1L) return(rep_len(TRUE,ncol(x)))
   Eq <- x[1:(n-1),,drop=FALSE] == x[2:n,,drop=FALSE]
   Eq[is.na(Eq)] <- FALSE
   colSums(Eq) == (n-1L)
 }