sampleInfoFromGEO <- function(file, remove.constant.columns=TRUE)
# Parse sample information from a GEO series matrix file into character matrices
# Gordon Smyth
# Created 7 Nov 2021.
{
  con <- file(file, "r")  
  on.exit(close(con))
  i <- 0
  repeat {
    i <- i+1
    txt <- readLines(con,n=1)
    if(!length(txt)) stop("Sample information not found in file. Input should be a GEO series matrix file.")
    if(identical(substring(txt,1,8),"!Sample_")) break
  }
  Lines <- list()
  i <- 0
  repeat {
    i <- i+1
    Lines[[i]] <- txt
    txt <- readLines(con,n=1)
    if(!length(txt)) break
    if(!identical(substring(txt,1,8),"!Sample_")) break
  }
  Lines <- lapply(Lines,function(x) strsplit(x,split="\t")[[1]])
  NSamples <- lengths(Lines)
  Short <- which(NSamples < max(NSamples))
  if(length(Short)) for (i in Short) Lines[[i]] <- NULL
  NSamples <- NSamples[1] - 1L
  Names <- vapply(Lines, FUN=function(x) substring(x[1],9,nchar(x[1])), FUN.VALUE="")
  Lines <- vapply(Lines, FUN=function(x) gsub("\"","",x[-1]), FUN.VALUE=character(NSamples))
  colnames(Lines) <- Names
  rownames(Lines) <- 1:NSamples

# Parse characteristics columns
  i <- grep("characteristics_ch1",colnames(Lines))
  if(length(i)) {
    CharacteristicsCh1 <- .parseGEOSampleCharacteristics(Lines[,i,drop=FALSE])
    Lines <- Lines[,-i]
  } else {
    CharacteristicsCh1 <- NULL
  }
  i <- grep("characteristics_ch2",colnames(Lines))
  if(length(i)) {
    CharacteristicsCh2 <- .parseGEOSampleCharacteristics(Lines[,i,drop=FALSE])
    Lines <- Lines[,-i]
  } else {
    CharacteristicsCh2 <- NULL
  }

  if(remove.constant.columns && length(Lines) > 0L) {
    jl <- .columnsAllEqual(Lines)
    if(any(jl)) Lines <- Lines[,!jl,drop=FALSE]
  }
  if(remove.constant.columns && length(CharacteristicsCh1) > 0L) {
    jl <- .columnsAllEqual(CharacteristicsCh1)
    if(any(jl)) CharacteristicsCh1 <- CharacteristicsCh1[,!jl,drop=FALSE]
  }
  if(remove.constant.columns && length(CharacteristicsCh2) > 0L) {
    jl <- .columnsAllEqual(CharacteristicsCh2)
    if(any(jl)) CharacteristicsCh2 <- CharacteristicsCh2[,!jl,drop=FALSE]
  }

  list(SampleInfo=Lines,CharacteristicsCh1=CharacteristicsCh1,CharacteristicsCh2=CharacteristicsCh2)
}

.parseGEOSampleCharacteristics <- function(x)
# Parse the sample characteristics rows of a GEO series matrix file
# Gordon Smyth
# Created 7 Nov 2021.
{
  nsamples <- nrow(x)
  characteristic.value <- strsplit2(x,split=": ")
  characteristic <- characteristic.value[,1]
  value <- characteristic.value[,2]
  Characteristics <- unique(characteristic)
  Characteristics <- Characteristics[Characteristics != ""]
  ncharacteristics <- length(Characteristics)
  targets <- matrix(NA_character_,nsamples,ncharacteristics)
  rownames(targets) <- 1:nsamples
  colnames(targets) <- Characteristics
  sample <- rep(1:nsamples,ncol(x))
  for (i in 1:length(x)) {
    if(value[i] != "") targets[sample[i],characteristic[i]] <- value[i]
  }
  targets
}

.columnsAllEqual <- function(x)
# Identify which columns of a matrix have all elements the same
# Gordon Smyth
# Created 7 Nov 2021.
{
  x <- as.matrix(x)
  n <- nrow(x)
  if(n <= 1L) return(rep_len(TRUE,ncol(x)))
  Eq <- x[1:(n-1),,drop=FALSE] == x[2:n,,drop=FALSE]
  Eq[is.na(Eq)] <- FALSE
  colSums(Eq) == (n-1L)
}