#' peptideSet constructor #' #' This function reads GenePix results (.gpr) files and creates a peptideSet object #' gathering experiment information. #' #' @param files A \code{character} vector. If NULL, all files with a .gpr extension #' in the specified path will be read. #' @param path A \code{character} string. The directory where the .gpr files to #' read are located. #' @param mapping.file A \code{character} string or \code{data.frame}. A mapping file #' that gives information for each sample. See details section below for a list of #' required fields. #' @param use.flags A \code{logical}. Should spots with flag value -99 or lower #' be excluded? #' @param rm.control.list A \code{character} vector. The name of the controls to #' be excluded from the peptideSet. #' @param empty.control.list A \code{character} vector. The name of the empty #' controls. If non NULL, the intensity of these empty spots will be substracted #' from remaining intensities. #' @param bgCorrect.method A \code{character} string. The name of the method used #' for background correction. This is passed to limma's backgroundCorrect method. #' See details section below and ?backgroundCorrect for more information. #' @param log A \code{logical}. If TRUE, intensities will be log2 transformed after #' BG correction. #' @param check.row.order A \code{logical}. Should slides be reduced to a common #' set of peptides? #' @param verbose A \code{logical}. Displays progress and additional information. #' #' @details #' GenePix results files (.gpr) are read when found in either the files or path #' arguments. By default, the foreground and background median intensities of the #' "red" channels, "F635 Median" and "B635 Median", are read. The background #' correction specified in bgCorrect.method is passed to the backgroundCorrect #' method in the limma package. #' #' The mapping.file can be either a filename or a data.frame. In any case, it should #' contain at least three columns labeled "filename", "ptid" and "visit". The #' filenames given here should match those read from the path or files argument, #' or be a subset of it. For each ptid (patient ID), the visit column should have at #' least one "pre" and one "post" sample. Any additional column will be kept into #' the resulting \code{peptideSet} and can be used later on for groupping. #' #' If check.row.order = TRUE, the final set of probes is taken to be those with #' IDs found in all arrays that were read. #' #' Row, Column and Block spatial array position for each probe are stored in the #' \code{featureRanges} slot of the returned object. #' #' #' @return A \code{peptideSet} object that contain the intensities, peptide #' sequences and annotations available in the mapping file. #' #' @examples #' # Read gpr files #' library(pepDat) #' mapFile <- system.file("extdata/mapping.csv", package = "pepDat") #' dirToParse <- system.file("extdata/gpr_samples", package = "pepDat") #' pSet <- makePeptideSet(files = NULL, path = dirToParse, #' mapping.file = mapFile, log=TRUE) #' #' # Specify controls to be removed and empty controls #' # to be used for background correction. #' pSet <- makePeptideSet(files = NULL, path = dirToParse, #' mapping.file = mapFile, log = TRUE, #' rm.control.list = c("JPT-control", "Ig", "Cy3"), #' empty.control.list= c("empty", "blank control")) #' #' @seealso \code{\link{peptideSet}}, \code{\link{read.maimages}}, #' \code{\link{backgroundCorrect}} #' #' @author Raphael Gottardo, Gregory Imholte #' #' @rdname makePeptideSet #' @export #' @importFrom tools file_path_sans_ext file_ext #' @importFrom limma read.maimages backgroundCorrect #' @importFrom Biobase preproc preproc<- assayData phenoData experimentData #' protocolData sampleNames sampleNames<- pData pData<- exprs exprs<- rowMedians #' @importFrom IRanges IRanges space #' @importFrom GenomicRanges GRanges #' makePeptideSet<-function(files=NULL, path=NULL, mapping.file=NULL, use.flags=FALSE, rm.control.list=NULL, empty.control.list=NULL, bgCorrect.method="normexp", log=TRUE, check.row.order=FALSE, verbose=FALSE){ # There is some ambiguity with respect to what is Name and ID # ID -> peptide # Name -> annotation f <- function(x) as.numeric(x$Flags > -99) # before reading in files, check whether mapping.file is accessible, # to save user time in case they made a mistake. if (!is.null(mapping.file)){ mapping.file<-.sanitize_mapping_file2(mapping.file) files <- file_path_sans_ext(mapping.file$filename) } if (!check.row.order) { # Assume that the design is the same and don't check rows, order, etc. RG <- read.maimages(files=files, source="genepix", path=path, ext="gpr", columns=list(R="F635 Median",Rb="B635 Median"), wt.fun=f,verbose=verbose) } else { # Used if the arrays don't exactly contain the same feature (e.g. the design has changed) files <- grep("gpr",list.files(path),value=TRUE) RG.list <- lapply(files, read.maimages, source="genepix", path=path, columns=list(R="F635 Median",Rb="B635 Median"), wt.fun=f, verbose=verbose) if(verbose){ cat("Reordering all peptides\n") } RG <- RG.list[[1]] # Find the common target target.id <- Reduce(intersect,lapply(RG.list,function(x){x$genes$ID})) if(length(target.id)==0){ stop("No common features found across slides") } # subset all RG.list <- lapply(RG.list,function(x, target){ ind <- x$genes$ID%in%target; x$genes <- x$genes[ind,]; x$Eb <- x$Eb[ind]; x$E <- x$E[ind]; x }, target.id) # order all RG.list <- lapply(RG.list, function(x, target){ ind <- order(x$genes$ID); x$genes <- x$genes[ind,]; x$Eb <- x$Eb[ind]; x$E <- x$E[ind]; x }) RG$E <- do.call(cbind,lapply(RG.list,function(x){x$E})) RG$Eb <- do.call(cbind,lapply(RG.list,function(x){x$Eb})) RG$targets <- do.call(rbind,lapply(RG.list,function(x){x$targets})) # Make sure the sample names are consitent across objects colnames(RG$E) <- rownames(RG$targets) colnames(RG$Eb) <- rownames(RG$targets) RG$genes <- RG.list[[1]]$genes } offset <- 0.5 if (bgCorrect.method=="half") offset <- .5 else offset <- 1 RG <- try(backgroundCorrect(RG, method=bgCorrect.method, offset=offset, verbose=verbose)) myDesc <- new("MIAME") ## Put NA instead of flags if (use.flags) { RG$E[RG$weights==0] <- NA } # Keep track of printer and source preproc(myDesc)$source<-RG$source preproc(myDesc)$printer<-RG$printer ## Fill in the details of the preprocessing ## Put NA instead of flags if (use.flags) { RG$E[RG$weights==0] <- NA } if (log) { RG$E <- log2(RG$E) preproc(myDesc) <- c(preproc(myDesc),list(transformation="log", normalization="none")) } else { preproc(myDesc) <- c(preproc(myDesc),list(transformation="none", normalization="none")) } if(!is.null(empty.control.list)){ norm.empty <- TRUE } else{ norm.empty <- FALSE } preproc(myDesc)$bgCorrect.method <- bgCorrect.method preproc(myDesc)$norm.empty <- norm.empty if (norm.empty) { #Check both name and ID for the control list index <- RG$genes$Name%in%empty.control.list | RG$genes$ID%in%empty.control.list if(sum(index)==0){ warning("No empty controls matching the given list were found.") mean.empty <- rep(0, ncol(as.matrix(RG$E))) } else{ if (verbose) { cat("** Background corrected using ", sum(index), " empty splots **\n") } mean.empty <- matrix(colMeans(as.matrix(RG$E[index,])), nrow=nrow(as.matrix(RG$E)), ncol=ncol(as.matrix(RG$E)), byrow=TRUE) } } else { mean.empty <- rep(0, ncol(as.matrix(RG$E))) } # Keep the layout layout <- lapply(RG$genes[, c("Block","Row","Column")],as.factor) # Remove controls ind.keep <- rep(TRUE,nrow(RG$E)) if (!is.null(rm.control.list)) { ind.keep <- lapply(rm.control.list, function(x, Name, ID){ !grepl(x,Name) & !grepl(x,ID)}, as.character(RG$genes$Name), as.character(RG$genes$ID)) ind.keep <- do.call(cbind,ind.keep) ind.keep <- apply(ind.keep,1,all) } ## See note above about Name and ID # ID -> peptide # Name -> annotation featureSequence <- as.character(RG$genes$ID)[ind.keep] featureID <- as.character(RG$genes$Name)[ind.keep] nPep <- length(which(ind.keep)) # Positions are set to zero before the information is provided in summarize pSet pSet <- new('peptideSet', featureRange = GRanges(seqnames = " ", strand = "*", ranges = IRanges(rep(0,nPep),rep(0,nPep)), featureID, peptide = featureSequence, block = layout$Block[ind.keep], row = layout$Row[ind.keep], column = layout$Column[ind.keep]), exprs = as.matrix(RG$E-mean.empty)[ind.keep, ], experimentData = myDesc) # Make sure everything is stored as lower sampleNames(pSet)<-tolower(sampleNames(pSet)) if(!is.null(mapping.file)){ pData(pSet) <- mapping.file[match(sampleNames(pSet), rownames(mapping.file)), ] } return(pSet) } .sanitize_mapping_file2 <- function(mapping.file){ if(is.character(mapping.file)){ map <- read.csv(mapping.file, header=TRUE) } else if(is.data.frame(mapping.file)){ map <- mapping.file } else { stop("The mapping file should be a character vector or a data.frame") } colnames(map) <- tolower(colnames(map)) if(!all(c("filename", "ptid", "visit") %in% colnames(map))){ stop("The mapping file should include at least the 3 mandatory columns: 'filename', 'ptid' and 'visit'") } row.names(map) <- file_path_sans_ext(tolower(map$filename)) return(map) }