Bioconductor Code: pepStat

Raw Blame Patch Log History
#' peptideSet constructor
#'
#' This function reads GenePix results (.gpr) files and creates a peptideSet object
#' gathering experiment information.
#'
#' @param files A \code{character} vector. If NULL, all files with a .gpr extension
#' in the specified path will be read.
#' @param path A \code{character} string. The directory where the .gpr files to
#' read are located.
#' @param mapping.file A \code{character} string or \code{data.frame}. A mapping file
#' that gives information for each sample. See details section below for a list of
#' required fields.
#' @param use.flags A \code{logical}. Should spots with flag value -99 or lower
#' be excluded?
#' @param rm.control.list A \code{character} vector. The name of the controls to
#' be excluded from the peptideSet.
#' @param empty.control.list A \code{character} vector. The name of the empty
#' controls. If non NULL, the intensity of these empty spots will be substracted
#' from remaining intensities.
#' @param bgCorrect.method A \code{character} string. The name of the method used
#' for background correction. This is passed to limma's backgroundCorrect method.
#' See details section below and ?backgroundCorrect for more information.
#' @param log A \code{logical}. If TRUE, intensities will be log2 transformed after
#' BG correction.
#' @param check.row.order A \code{logical}. Should slides be reduced to a common
#' set of peptides?
#' @param verbose A \code{logical}. Displays progress and additional information.
#'
#' @details
#' GenePix results files (.gpr) are read when found in either the files or path
#' arguments. By default, the foreground and background median intensities of the
#' "red" channels, "F635 Median" and "B635 Median", are read. The background
#' correction specified in bgCorrect.method is passed to the backgroundCorrect
#' method in the limma package.
#'
#' The mapping.file can be either a filename or a data.frame. In any case, it should
#' contain at least three columns labeled "filename", "ptid" and "visit". The
#' filenames given here should match those read from the path or files argument,
#' or be a subset of it. For each ptid (patient ID), the visit column should have at
#' least one "pre" and  one "post" sample. Any additional column will be kept into
#' the resulting \code{peptideSet} and can be used later on for groupping.
#'
#' If check.row.order = TRUE, the final set of probes is taken to be those with
#' IDs found in all arrays that were read.
#'
#' Row, Column and Block spatial array position for each probe are stored in the
#' \code{featureRanges} slot of the returned object.
#'
#'
#' @return A \code{peptideSet} object that contain the intensities, peptide
#' sequences and annotations available in the mapping file.
#'
#' @examples
#' # Read gpr files
#' library(pepDat)
#' mapFile <- system.file("extdata/mapping.csv", package = "pepDat")
#' dirToParse <- system.file("extdata/gpr_samples", package = "pepDat")
#' pSet <- makePeptideSet(files = NULL, path = dirToParse,
#'                        mapping.file = mapFile, log=TRUE)
#'
#' # Specify controls to be removed and empty controls
#' # to be used for background correction.
#' pSet <- makePeptideSet(files = NULL, path = dirToParse,
#'                        mapping.file = mapFile, log = TRUE,
#'                        rm.control.list = c("JPT-control", "Ig", "Cy3"),
#'                        empty.control.list= c("empty", "blank control"))
#'
#' @seealso \code{\link{peptideSet}}, \code{\link{read.maimages}},
#' \code{\link{backgroundCorrect}}
#'
#' @author Raphael Gottardo, Gregory Imholte
#'
#' @rdname makePeptideSet
#' @export
#' @importFrom tools file_path_sans_ext file_ext
#' @importFrom limma read.maimages backgroundCorrect
#' @importFrom Biobase preproc preproc<- assayData phenoData experimentData
#'   protocolData sampleNames sampleNames<- pData pData<- exprs exprs<- rowMedians
#' @importFrom IRanges IRanges space
#' @importFrom GenomicRanges GRanges
#'
makePeptideSet<-function(files=NULL, path=NULL, mapping.file=NULL, use.flags=FALSE,
                         rm.control.list=NULL, empty.control.list=NULL,
                         bgCorrect.method="normexp", log=TRUE, check.row.order=FALSE,
                         verbose=FALSE){
  # There is some ambiguity with respect to what is Name and ID
  # ID -> peptide
  # Name -> annotation
  f <- function(x) as.numeric(x$Flags > -99)

  # before reading in files, check whether mapping.file is accessible,
  # to save user time in case they made a mistake.
  if (!is.null(mapping.file)){
    mapping.file<-.sanitize_mapping_file2(mapping.file)
    files <- file_path_sans_ext(mapping.file$filename)
  }

  if (!check.row.order) { # Assume that the design is the same and don't check rows, order, etc.
    RG <- read.maimages(files=files,
                        source="genepix", path=path, ext="gpr",
                        columns=list(R="F635 Median",Rb="B635 Median"),
                        wt.fun=f,verbose=verbose)
  } else { # Used if the arrays don't exactly contain the same feature (e.g. the design has changed)
    files <- grep("gpr",list.files(path),value=TRUE)
    RG.list <- lapply(files, read.maimages, source="genepix",
                      path=path, columns=list(R="F635 Median",Rb="B635 Median"),
                      wt.fun=f, verbose=verbose)
    if(verbose){
      cat("Reordering all peptides\n")
    }

    RG <- RG.list[[1]]
    # Find the common target
    target.id <- Reduce(intersect,lapply(RG.list,function(x){x$genes$ID}))
    if(length(target.id)==0){
      stop("No common features found across slides")
    }

    # subset all
    RG.list <- lapply(RG.list,function(x, target){
      ind <- x$genes$ID%in%target;
      x$genes <- x$genes[ind,];
      x$Eb <- x$Eb[ind];
      x$E <- x$E[ind];
      x
    }, target.id)

    # order all
    RG.list <- lapply(RG.list, function(x, target){
      ind <- order(x$genes$ID);
      x$genes <- x$genes[ind,];
      x$Eb <- x$Eb[ind];
      x$E <- x$E[ind];
      x
    })
    RG$E <- do.call(cbind,lapply(RG.list,function(x){x$E}))
    RG$Eb <- do.call(cbind,lapply(RG.list,function(x){x$Eb}))
    RG$targets <- do.call(rbind,lapply(RG.list,function(x){x$targets}))
    # Make sure the sample names are consitent across objects
    colnames(RG$E) <- rownames(RG$targets)
    colnames(RG$Eb) <- rownames(RG$targets)
    RG$genes <- RG.list[[1]]$genes

  }

  offset <- 0.5
  if (bgCorrect.method=="half") offset <- .5 else offset <- 1
  RG <- try(backgroundCorrect(RG, method=bgCorrect.method, offset=offset, verbose=verbose))

  myDesc <- new("MIAME")

  ## Put NA instead of flags
  if (use.flags)
  {
    RG$E[RG$weights==0] <- NA
  }

  # Keep track of printer and source
  preproc(myDesc)$source<-RG$source
  preproc(myDesc)$printer<-RG$printer

  ## Fill in the details of the preprocessing

  ## Put NA instead of flags
  if (use.flags)
  {
    RG$E[RG$weights==0] <- NA
  }
  if (log) {
    RG$E <- log2(RG$E)
    preproc(myDesc) <- c(preproc(myDesc),list(transformation="log", normalization="none"))
  } else {
    preproc(myDesc) <- c(preproc(myDesc),list(transformation="none", normalization="none"))
  }

  if(!is.null(empty.control.list)){
    norm.empty <- TRUE
  } else{
    norm.empty <- FALSE
  }
  preproc(myDesc)$bgCorrect.method <- bgCorrect.method
  preproc(myDesc)$norm.empty <- norm.empty

  if (norm.empty) {
    #Check both name and ID for the control list
    index <- RG$genes$Name%in%empty.control.list | RG$genes$ID%in%empty.control.list
    if(sum(index)==0){
      warning("No empty controls matching the given list were found.")
      mean.empty <- rep(0, ncol(as.matrix(RG$E)))
    } else{
      if (verbose) {
        cat("** Background corrected using ", sum(index), " empty splots **\n")
      }
      mean.empty <- matrix(colMeans(as.matrix(RG$E[index,])),
                           nrow=nrow(as.matrix(RG$E)),
                           ncol=ncol(as.matrix(RG$E)),
                           byrow=TRUE)
    }
  } else {
    mean.empty <- rep(0, ncol(as.matrix(RG$E)))
  }

  # Keep the layout
  layout <- lapply(RG$genes[, c("Block","Row","Column")],as.factor)

  # Remove controls
  ind.keep <- rep(TRUE,nrow(RG$E))
  if (!is.null(rm.control.list)) {
    ind.keep <- lapply(rm.control.list,
                       function(x, Name, ID){
                         !grepl(x,Name) & !grepl(x,ID)},
                       as.character(RG$genes$Name),
                       as.character(RG$genes$ID))
    ind.keep <- do.call(cbind,ind.keep)
    ind.keep <- apply(ind.keep,1,all)
  }

  ## See note above about Name and ID
  # ID -> peptide
  # Name -> annotation
  featureSequence <- as.character(RG$genes$ID)[ind.keep]
  featureID <- as.character(RG$genes$Name)[ind.keep]
  nPep <- length(which(ind.keep))
  # Positions are set to zero before the information is provided in summarize pSet
  pSet <- new('peptideSet',
              featureRange = GRanges(seqnames = " ", strand = "*",
              ranges = IRanges(rep(0,nPep),rep(0,nPep)), featureID,
              peptide = featureSequence, block = layout$Block[ind.keep],
              row = layout$Row[ind.keep], column = layout$Column[ind.keep]),
              exprs = as.matrix(RG$E-mean.empty)[ind.keep, ],
              experimentData = myDesc)

  # Make sure everything is stored as lower
  sampleNames(pSet)<-tolower(sampleNames(pSet))

  if(!is.null(mapping.file)){
    pData(pSet) <- mapping.file[match(sampleNames(pSet), rownames(mapping.file)), ]
  }
  return(pSet)
}

.sanitize_mapping_file2 <- function(mapping.file){
  if(is.character(mapping.file)){
    map <- read.csv(mapping.file, header=TRUE)
  } else if(is.data.frame(mapping.file)){
    map <- mapping.file
  } else {
    stop("The mapping file should be a character vector or a data.frame")
  }
  colnames(map) <- tolower(colnames(map))
  if(!all(c("filename", "ptid", "visit") %in% colnames(map))){
    stop("The mapping file should include at least the 3 mandatory columns: 'filename', 'ptid' and 'visit'")
  }
  row.names(map) <- file_path_sans_ext(tolower(map$filename))
  return(map)
}