Bioconductor Code: TCGAbiolinks

R/prepare.R

178c3b9a	#' @title Prepare GDC data #' @description #' Reads the data downloaded and prepare it into an R object #' @param query A query for GDCquery function #' @param save Save result as RData object? #' @param save.filename Name of the file to be save if empty an automatic will be created
c1b5e11c	#' @param directory Directory/Folder where the data was downloaded. Default: GDCdata
67c15b4c	#' @param summarizedExperiment Create a summarizedExperiment? Default TRUE (if possible)
c1b5e11c	#' @param remove.files.prepared Remove the files read? Default: FALSE #' This argument will be considered only if save argument is set to true
48fb8c4e	#' @param add.gistic2.mut If a list of genes (gene symbol) is given, columns with gistic2 results from GDAC firehose (hg19) #' and a column indicating if there is or not mutation in that gene (hg38) #' (TRUE or FALSE - use the MAF file for more information) #' will be added to the sample matrix in the summarized Experiment object.
a58ac18c	#' @param mutant_variant_classification List of mutant_variant_classification that will be #' consider a sample mutant or not. Default: "Frame_Shift_Del", "Frame_Shift_Ins", #' "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", #' "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation"
178c3b9a	#' @export
88ef1778	#' @examples
093df0db	#' \dontrun{
f22676b7	#' query <- GDCquery( #' project = "TCGA-KIRP", #' data.category = "Simple Nucleotide Variation",
4413c90f	#' data.type = "Masked Somatic Mutation"
f22676b7	#' )
88ef1778	#' GDCdownload(query, method = "api", directory = "maf") #' maf <- GDCprepare(query, directory = "maf") #'
a77d212e	#' }
178c3b9a	#' @return A summarizedExperiment or a data.frame
ee9a77b7	#' @importFrom S4Vectors DataFrame
e7804dce	#' @importFrom SummarizedExperiment metadata<-
eba132ca	#' @importFrom data.table setcolorder setnames
9a62fdd6	#' @importFrom GenomicRanges GRanges #' @importFrom IRanges IRanges
28ae53ec	#' @importFrom purrr map_chr
02c318ae	#' @author Tiago Chedraoui Silva
afcb2a28	GDCprepare <- function(
045b7659	query, save = FALSE, save.filename, directory = "GDCdata", summarizedExperiment = TRUE, remove.files.prepared = FALSE, add.gistic2.mut = NULL, mutant_variant_classification = c( "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation"
919dc166	)
045b7659	){ isServeOK() if(missing(query)) stop("Please set query parameter")
b2bc706f	test.duplicated.cases <- ( any( duplicated(query$results[[1]]$cases)) & # any duplicated !(query$data.type %in% c( "Clinical data", "Protein expression quantification", "Raw intensities", "Masked Intensities", "Clinical Supplement", "Masked Somatic Mutation", "Biospecimen Supplement" ) ) )
045b7659	if(test.duplicated.cases) { dup <- query$results[[1]]$cases[duplicated(query$results[[1]]$cases)] cols <- c("tags","cases","experimental_strategy","analysis_workflow_type") cols <- cols[cols %in% colnames(query$results[[1]])] dup <- query$results[[1]][query$results[[1]]$cases %in% dup,cols] dup <- dup[order(dup$cases),] print(knitr::kable(dup)) stop("There are samples duplicated. We will not be able to prepare it") } if (!save & remove.files.prepared) { stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost") }
b2bc706f	# We save the files in project/data.category/data.type/file_id/file_name
045b7659	files <- file.path(
b2bc706f	query$results[[1]]$project,
afcb2a28	gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type),
045b7659	gsub(" ","_",query$results[[1]]$file_id),
afcb2a28	gsub(" ","_",query$results[[1]]$file_name)
38297df4	)
045b7659	files <- file.path(directory, files) # For IDAT prepare since we need to put all IDATs in the same folder the code below will not work # a second run if (!all(file.exists(files))) {
b2bc706f	# We have to check we moved the files if ( unique(query$results[[1]]$data_type) == "Masked Intensities" \| unique(query$results[[1]]$data_category) == "Raw microarray data" ){
045b7659	files.idat <- file.path(
b2bc706f	query$results[[1]]$project,
045b7659	gsub(" ","_",query$results[[1]]$data_category), gsub(" ","_",query$results[[1]]$data_type), gsub(" ","_",query$results[[1]]$file_name) )
b2bc706f
045b7659	files.idat <- file.path(directory, files.idat) if (!all(file.exists(files) \| file.exists(files.idat))) { stop( paste0( "I couldn't find all the files from the query. ", "Please check if the directory parameter is right ", "or `GDCdownload` downloaded the samples." ) ) } } else { stop( paste0( "I couldn't find all the files from the query. ", "Please check if the directory parameter is right ", "or `GDCdownload` downloaded the samples." ) ) }
178c3b9a	}
045b7659	cases <- ifelse(
c4457d9c	grepl("TCGA\|TARGET\|CGCI-HTMCP-CC\|CPTAC-2\|BEATAML1.0-COHORT",query$results[[1]]$project %>% unlist()),
045b7659	query$results[[1]]$cases, query$results[[1]]$sample.submitter_id ) if (grepl("Transcriptome Profiling", query$data.category, ignore.case = TRUE)){ if(unique(query$results[[1]]$experimental_strategy) == "scRNA-Seq"){ #if (grepl("Single Cell Analysis", unique(query$results[[1]]$data_type), ignore.case = TRUE)){ data <- readSingleCellAnalysis( files = files, data_format = unique(query$results[[1]]$data_format), workflow.type = unique(query$results[[1]]$analysis_workflow_type), cases = cases ) return(data) } else { data <- readTranscriptomeProfiling( files = files, data.type = ifelse(!is.na(query$data.type), as.character(query$data.type), unique(query$results[[1]]$data_type)), workflow.type = unique(query$results[[1]]$analysis_workflow_type), cases = cases, summarizedExperiment ) } } else if(grepl("Copy Number Variation",query$data.category,ignore.case = TRUE)) { if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number Scores") { data <- readGISTIC(files, query$results[[1]]$cases) } else if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number") {
b2bc706f	data <- read_gene_level_copy_number( files = files, cases = query$results[[1]]$sample.submitter_id, summarizedExperiment = summarizedExperiment )
045b7659	} else {
b2bc706f	data <- read_copy_number_variation( files = files, cases = query$results[[1]]$cases )
045b7659	} } else if (grepl("Methylation Beta Value",unique(query$results[[1]]$data_type), ignore.case = TRUE)) { data <- readDNAmethylation( files = files, cases = cases, summarizedExperiment = summarizedExperiment,
c8404f5f	platform = unique(query$results[[1]]$platform)
38297df4	)
045b7659	} else if (grepl("Raw intensities\|Masked Intensities",query$data.type, ignore.case = TRUE)) { # preparing IDAT files data <- readIDATDNAmethylation( files = files, barcode = cases, summarizedExperiment = summarizedExperiment,
c8404f5f	platform = unique(query$results[[1]]$platform)
38297df4	)
b2bc706f	} else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
045b7659	data <- readProteomeProfiling(files, cases = cases)
b2bc706f	} else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) {
045b7659	data <- readProteinExpression(files, cases = cases)
b2bc706f	if (summarizedExperiment) {
045b7659	message("SummarizedExperiment not implemented, if you need samples metadata use the function TCGAbiolinks:::colDataPrepare") } } else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) {
b2bc706f	if (grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
045b7659	data <- readSimpleNucleotideVariationMaf(files) } } else if (grepl("Clinical\|Biospecimen", query$data.category, ignore.case = TRUE)){
b2bc706f	data <- read_clinical(files, query$data.type, cases = cases)
045b7659	summarizedExperiment <- FALSE } else if (grepl("Gene expression",query$data.category,ignore.case = TRUE)) { if (query$data.type == "Gene expression quantification")
b2bc706f	data <- read_gene_expression_quantification(
045b7659	files = files, cases = cases, summarizedExperiment = summarizedExperiment,
c8404f5f	genome = "hg38",
045b7659	experimental.strategy = unique(query$results[[1]]$experimental_strategy) ) if (query$data.type == "miRNA gene quantification")
b2bc706f	data <- read_gene_expression_quantification(
045b7659	files = files, cases = cases, summarizedExperiment = FALSE,
c8404f5f	genome = "hg38",
045b7659	experimental.strategy = unique(query$results[[1]]$experimental_strategy) ) if (query$data.type == "miRNA isoform quantification") data <- readmiRNAIsoformQuantification( files = files, cases = query$results[[1]]$cases ) if (query$data.type == "Isoform expression quantification") data <- readIsoformExpressionQuantification(files = files, cases = cases) if (query$data.type == "Exon quantification") data <- readExonQuantification( files = files, cases = cases, summarizedExperiment = summarizedExperiment )
38297df4	}
045b7659	# Add data release to object if (summarizedExperiment & !is.data.frame(data)) { metadata(data) <- list("data_release" = getGDCInfo()$data_release)
38297df4	}
045b7659	if ((!is.null(add.gistic2.mut)) & summarizedExperiment) { message("=> Adding GISTIC2 and mutation information....") genes <- tolower(levels(EAGenes$Gene)) if (!all(tolower(add.gistic2.mut) %in% genes)) { message( paste("These genes were not found:\n", paste(add.gistic2.mut[! tolower(add.gistic2.mut) %in% genes],collapse = "\n=> ") ) ) } add.gistic2.mut <- add.gistic2.mut[tolower(add.gistic2.mut) %in% tolower(genes)] if (length(add.gistic2.mut) > 0){ info <- colData(data) for(i in unlist(query$project)){ info <- get.mut.gistc.information( info, i, add.gistic2.mut, mutant_variant_classification = mutant_variant_classification ) } colData(data) <- info } } if("samples" %in% colnames(data)){ if(any(duplicated(data$sample))) { message("Replicates found.") if(any(data$is_ffpe)) message("FFPE should be removed. You can modify the data with the following command:\ndata <- data[,!data$is_ffpe]") print(as.data.frame(colData(data)[data$sample %in% data$sample[duplicated(data$sample)],c("is_ffpe"),drop = FALSE])) }
38297df4	}
045b7659	if(save){ if(missing(save.filename) & !missing(query)) save.filename <- paste0(query$project,gsub(" ","_", query$data.category),gsub(" ","_",date()),".RData") message(paste0("=> Saving file: ",save.filename)) save(data, file = save.filename) message("=> File saved") # save is true, due to the check in the beggining of the code if(remove.files.prepared){ # removes files and empty directories
b2bc706f	remove_files_recursively(files)
045b7659	} } return(data)
178c3b9a	}
b2bc706f	remove_files_recursively <- function(files){
045b7659	files2rm <- dirname(files) unlink(files2rm,recursive = TRUE) files2rm <- dirname(files2rm) # data category
b2bc706f	if(length(list.files(files2rm)) == 0) remove_files_recursively(files2rm)
c1b5e11c	}
26eb7e14
b2bc706f	read_clinical <- function(files, data.type, cases){
045b7659	if(data.type == "Clinical data"){ suppressMessages({ ret <- plyr::alply(files,.margins = 1,readr::read_tsv, .progress = "text") }) names(ret) <- gsub("nationwidechildrens.org_","",gsub(".txt","",basename(files))) } else if(data.type %in% c("Clinical Supplement","Biospecimen Supplement")){ ret <- plyr::alply(files,.margins = 1,function(f) { readr::read_tsv(f,col_types = readr::cols()) }, .progress = "text") names(ret) <- gsub("nationwidechildrens.org_","",gsub(".txt","",basename(files))) } return(ret)
5dab05e3	}
cc200045
f1d96802
f22676b7	readSingleCellAnalysis <- function(
045b7659	files = files, data_format = NULL, workflow.type = NULL, cases = cases
f22676b7	) {
045b7659	if(data_format == "MEX" & workflow.type == "CellRanger - 10x Filtered Counts"){ check_package("Seurat") ret <- plyr::llply(files,.fun = function(f){ untar(tarfile = f,exdir = dirname(f)) Seurat::Read10X(data.dir = gsub("\\.tar\\.gz","",f)) },.progress = "time") names(ret) <- cases } if(data_format == "MEX" & workflow.type == "CellRanger - 10x Raw Counts"){ ret <- plyr::llply(files,.fun = function(f){ # uncompress raw file untar(tarfile = f,exdir = dirname(f)) Read10X(data.dir = gsub("\\.tar\\.gz","",f)) },.progress = "time") names(ret) <- cases } # TSV files if(data_format == "TSV"){ ret <- plyr::llply(files,.fun = function(f){ readr::read_tsv(f) },.progress = "time") names(ret) <- cases } if(data_format == "HDF5"){ stop("We are not preparing loom files") # check_package("SeuratDisk") # check_package("Seurat") # ret <- SeuratDisk::Connect(filename = files, mode = "r") # ret <- Seurat::as.Seurat(ret) print(files) } # HDF5 return(ret)
f22676b7	}
9e56ad43	#' @importFrom tidyr separate
b2bc706f	readExonQuantification <- function ( files, cases, summarizedExperiment = TRUE ){
045b7659	pb <- txtProgressBar(min = 0, max = length(files), style = 3) assay.list <- NULL for (i in seq_along(files)) { data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE) if(!missing(cases)) { assay.list <- gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]) # We will use this because there might be more than one col for each samples setnames(data,colnames(data)[2:ncol(data)], paste0(gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i])) } if (i == 1) { df <- data } else { df <- merge(df, data, by=colnames(data)[1], all = TRUE) }
b2bc706f
045b7659	setTxtProgressBar(pb, i)
f1d96802	}
045b7659	setDF(df) rownames(df) <- df[,1] df <- df %>% separate(exon,into = c("seqnames","coordinates","strand"),sep = ":") %>% separate(coordinates,into = c("start","end"),sep = "-")
b2bc706f
045b7659	if(summarizedExperiment) { suppressWarnings({ assays <- lapply(assay.list, function (x) { return(data.matrix(subset(df, select = grep(x,colnames(df),ignore.case = TRUE)))) }) }) names(assays) <- assay.list
b2bc706f	regex <- paste0( "[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}" )
045b7659	samples <- na.omit(unique(str_match(colnames(df),regex)[,1])) colData <- colDataPrepare(samples) assays <- lapply(assays, function(x){ colnames(x) <- NULL rownames(x) <- NULL return(x) }) rowRanges <- makeGRangesFromDataFrame(df) message("Available assays in SummarizedExperiment : \n => ",paste(names(assays), collapse = "\n => ")) rse <- SummarizedExperiment( assays = assays, rowRanges = rowRanges, colData = colData ) return(rse)
f1d96802	}
045b7659	return(df)
f1d96802	}
cc200045	readIsoformExpressionQuantification <- function (files, cases){
045b7659	pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE) if(!missing(cases)) { assay.list <- gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]) # We will use this because there might be more than one col for each samples setnames(data,colnames(data)[2:ncol(data)], paste0(gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i])) } if (i == 1) { df <- data } else { df <- merge(df, data, by=colnames(data)[1], all = TRUE) } setTxtProgressBar(pb, i)
38297df4	}
045b7659	setDF(df) rownames(df) <- df[,1] df[,1] <- NULL return(df)
cc200045	}
93f1c154	readmiRNAIsoformQuantification <- function (files, cases){
045b7659	pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE) data$barcode <- cases[i] if (i == 1) { df <- data } else { df <- rbind(df, data) } setTxtProgressBar(pb, i)
93f1c154	}
045b7659	setDF(df)
93f1c154	}
3fc605c6
88ef1778	readSimpleNucleotideVariationMaf <- function(files){
045b7659
3fc605c6	ret <- files \|> purrr::map_dfr(.f = function(x) { tab <- readr::read_tsv( x, show_col_types = FALSE,
045b7659	comment = "#",
3fc605c6	col_types = readr::cols( SOMATIC = col_character(), PUBMED = col_character(), miRNA = col_character(), HGVS_OFFSET = col_integer(), PHENO = col_character(),
045b7659	Entrez_Gene_Id = col_integer(), Start_Position = col_integer(), End_Position = col_integer(), t_depth = col_integer(), t_ref_count = col_integer(), t_alt_count = col_integer(), n_depth = col_integer(), TRANSCRIPT_STRAND = col_integer(), PICK = col_integer(), TSL = col_integer(),
c1b9dfef	Allele = col_character(),
36436f05	Tumor_Seq_Allele1 = col_character(), Reference_Allele = col_character(),
c1b9dfef	Tumor_Seq_Allele2 = col_character(),
3fc605c6	DISTANCE = col_integer() )) # empty MAF file # https://portal.gdc.cancer.gov/files/7917fcbe-cb66-447d-8ea8-3a324feee3fa if(nrow(tab) == 0) {return (NULL)} tab
045b7659	})
3fc605c6
045b7659	return(ret)
88ef1778	}
b2bc706f	read_gene_expression_quantification <- function(
045b7659	files, cases, genome = "hg19", summarizedExperiment = TRUE, experimental.strategy, platform
e84eacd4	){
045b7659	skip <- unique((ifelse(experimental.strategy == "Gene expression array",1,0))) if(length(skip) > 1) stop("It is not possible to handle those different platforms together") print.header(paste0("Reading ", length(files)," files"),"subsection") ret <- plyr::alply( .data = seq_along(files), .margins = 1, .fun = function(i,cases){ data <- fread( input = files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE, skip = skip ) if(!missing(cases)) { assay.list <<- gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]) # We will use this because there might be more than one col for each samples setnames( data, colnames(data)[2:ncol(data)], paste0(gsub(" \|\$\|\$\|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i]) ) } data },.progress = "time",cases = cases) print.header(paste0("Merging ", length(files)," files"),"subsection") # Just check if the data is in the same order, since we will not merge # the data frames to save memory stopifnot(all(unlist(ret %>% map(function(y){all(y[,1] == ret[[1]][,1])}) ))) # need to check if it works in all cases df <- ret %>% map( ~ (.x %>% dplyr::select(-1))) %>% bind_cols() df <- bind_cols(ret[[1]][,1],df) if (summarizedExperiment) {
b2bc706f	df <- make_se_from_gene_exoression_quantification(df, assay.list, genome = genome)
045b7659	} else { rownames(df) <- df$gene_id df$gene_id <- NULL } return(df)
14236a24	}
e84eacd4
b2bc706f	make_se_from_gene_exoression_quantification <- function(
045b7659	df, assay.list, genome = "hg19"
18fd2879	){
045b7659	# Access genome information to create SE gene.location <- get.GRCh.bioMart(genome) if(all(grepl("\\\|",df[[1]]))){ aux <- strsplit(df$gene_id,"\\\|") GeneID <- unlist(lapply(aux,function(x) x[2])) df$entrezgene_id <- as.numeric(GeneID) gene.location <- gene.location[!duplicated(gene.location$entrezgene_id),] df <- merge(df, gene.location, by = "entrezgene_id") } else { df$external_gene_name <- as.character(df[[1]]) df <- merge(df, gene.location, by = "external_gene_name") } if("transcript_id" %in% assay.list){ rowRanges <- GRanges( seqnames = paste0("chr", df$chromosome_name), ranges = IRanges(start = df$start_position, end = df$end_position), strand = df$strand, gene_id = df$external_gene_name, entrezgene = df$entrezgene_id, ensembl_gene_id = df$ensembl_gene_id, transcript_id = subset(df, select = 5)
18fd2879	)
045b7659	names(rowRanges) <- as.character(df$gene_id) assay.list <- assay.list[which(assay.list != "transcript_id")] } else { rowRanges <- GRanges( seqnames = paste0("chr", df$chromosome_name), ranges = IRanges( start = df$start_position, end = df$end_position ), strand = df$strand, gene_id = df$external_gene_name, entrezgene = df$entrezgene_id, ensembl_gene_id = df$ensembl_gene_id ) names(rowRanges) <- as.character(df$external_gene_name) } suppressWarnings({ assays <- lapply(assay.list, function(x) { return( data.matrix( subset(df, select = grep(x,colnames(df),ignore.case = TRUE)) ) ) }) }) names(assays) <- assay.list regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") samples <- na.omit(unique(str_match(colnames(df),regex)[,1])) colData <- colDataPrepare(samples) assays <- lapply(assays, function(x){ colnames(x) <- NULL rownames(x) <- NULL return(x)
02d1bac3	})
045b7659	message("Available assays in SummarizedExperiment : \n => ",paste(names(assays), collapse = "\n => ")) rse <- SummarizedExperiment( assays = assays, rowRanges = rowRanges, colData = colData ) return(rse)
14236a24	}
178c3b9a
ad5fd34d	#' @importFrom downloader download #' @importFrom S4Vectors DataFrame
8b67c89a	makeSEFromDNAMethylationMatrix <- function(
045b7659	betas, genome = "hg38", met.platform = c( "Illumina Human Methylation 450", "Illumina Human Methylation 27", "Illumina Methylation Epic" )
38297df4	) {
045b7659	message("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-") message("Creating a SummarizedExperiment from DNA methylation input") # Instead of looking on the size, it is better to set it as a argument as the annotation is different annotation <- getMetPlatInfo(platform = met.platform, genome = genome) rowRanges <- annotation[names(annotation) %in% rownames(betas),,drop = FALSE]
5b7bcdb4	colData <- tryCatch({ colDataPrepare(colnames(betas)) }, error = function(e){ DataFrame(samples = colnames(betas)) })
045b7659	betas <- betas[rownames(betas) %in% names(rowRanges),,drop = FALSE] betas <- betas[names(rowRanges),,drop = FALSE] assay <- data.matrix(betas) betas <- SummarizedExperiment( assays = assay, rowRanges = rowRanges, colData = colData ) return(betas)
ad5fd34d	}
7ab7fc99	makeSEfromDNAmethylation <- function(df, probeInfo = NULL){
045b7659	if(is.null(probeInfo)) { rowRanges <- GRanges( seqnames = paste0("chr", df$Chromosome), ranges = IRanges(start = df$Genomic_Coordinate, end = df$Genomic_Coordinate), probeID = df$Composite.Element.REF, Gene_Symbol = df$Gene_Symbol ) names(rowRanges) <- as.character(df$Composite.Element.REF) colData <- colDataPrepare(colnames(df)[5:ncol(df)]) assay <- data.matrix(subset(df,select = c(5:ncol(df)))) } else { rowRanges <- makeGRangesFromDataFrame(probeInfo, keep.extra.columns = TRUE) colData <- colDataPrepare(colnames(df)[(ncol(probeInfo) + 1):ncol(df)]) assay <- data.matrix(subset(df,select = c((ncol(probeInfo) + 1):ncol(df)))) } colnames(assay) <- rownames(colData) rownames(assay) <- as.character(df$Composite.Element.REF) rse <- SummarizedExperiment(assays = assay, rowRanges = rowRanges, colData = colData)
178c3b9a	}
18fd2879	readIDATDNAmethylation <- function(
045b7659	files, barcode, summarizedExperiment,
c8404f5f	platform
18fd2879	) {
045b7659	check_package("sesame") # Check if moved files would be moved outside of scope folder, if so, path doesn't change
c8404f5f	moved.files <- sapply(files,USE.NAMES = FALSE,function(x){
045b7659	if (grepl("Raw_intensities\|Masked_Intensities",dirname(dirname(x)))) { return(file.path(dirname(dirname(x)), basename(x))) } return(x) }) # for each file move it to upper parent folder if necessary plyr::a_ply(files, 1,function(x){ if (grepl("Raw_intensities\|Masked_Intensities",dirname(dirname(x)))) { tryCatch( move(x, file.path(dirname(dirname(x)), basename(x)), keep.copy = FALSE ), error = function(e){ }) } }) samples <- unique(gsub("_Grn.idat\|_Red.idat","",moved.files)) message("Processing IDATs with Sesame - http://bioconductor.org/packages/sesame/") message("Running opensesame - applying quality masking and nondetection masking (threshold P-value 0.05)") message("Please cite: doi: 10.1093/nar/gky691 and 10.1093/nar/gkt090") message("This might take a while....") betas <- sesame::openSesame(samples) %>% as.matrix barcode <- unique(data.frame("file" = gsub("_Grn.idat\|_Red.idat","",basename(moved.files)), "barcode" = barcode)) colnames(betas) <- barcode$barcode[match(basename(samples),barcode$file)] if (summarizedExperiment) { betas <- makeSEFromDNAMethylationMatrix( betas = betas,
c8404f5f	genome ="hg38",
045b7659	met.platform = platform ) colData(betas) <- DataFrame(colDataPrepare(colnames(betas)))
38297df4	}
045b7659	return(betas)
ad5fd34d	}
178c3b9a	# We will try to make this function easier to use this function in its own data # In case it is not TCGA I should not consider that there is a barcode in the header # Instead the user should be able to add the names to his data # The only problem is that the data from the user will not have all the columns # TODO: Improve this function to be more generic as possible
0d7a13b9	#' @importFrom GenomicRanges makeGRangesFromDataFrame #' @importFrom tibble as_data_frame
7fe66781	#' @importFrom data.table fread
acdc3fa8	readDNAmethylation <- function(
045b7659	files, cases, summarizedExperiment = TRUE,
c8404f5f	platform
acdc3fa8	){
045b7659	if(length(platform) > 1){ print(knitr::kable(platform)) stop("More than one DNA methylation platform found. Only one is accepted")
38297df4	}
045b7659	if (missing(cases)) cases <- NULL if (grepl("OMA00",platform)) { pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- fread( files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE, skip = 1, na.strings = "N/A", colClasses = c( "character", # Composite Element REF "numeric" # # beta value ) ) setnames(data,gsub(" ", "\\.", colnames(data))) if (!is.null(cases)) setnames(data,2,cases[i]) if (i == 1) { df <- data } else { df <- merge(df, data, by = "Composite.Element.REF") } setTxtProgressBar(pb, i) } setDF(df) rownames(df) <- df$Composite.Element.REF df$Composite.Element.REF <- NULL } else if (all(grepl("methylation_array.sesame.level3betas", files))){ # methylation_array.sesame.level3betas has only two columns with not header print.header(paste0("Reading ", length(files)," files"),"subsection") x <- plyr::alply(files,1, function(f) { data <- fread( f, header = FALSE, sep = "\t", stringsAsFactors = FALSE, skip = 0, colClasses = c( "character", # CpG "numeric" # beta value ) ) setnames(data,gsub(" ", "\\.", colnames(data))) if (!is.null(cases)) setnames(data,2,cases[which(f == files)]) }, .progress = "time") print.header(paste0("Merging ", length(files)," files"),"subsection") # Just check if the data is in the same order, since we will not merge # the data frames to save memory
93fd0bdc	# C3N-01362-01 has less probes than C3L-00913-03 due to version of the array # CPTAC cohort: 55 samples with EPIC v1 and 1808 with EPIC V2 # EPIC v1 has less probes than EPIC V2 # In case we have multiple versions of the array we will # 1. Check the names of all of the same nrow # 2. Bind the ones with same nrow # 3. Full join the two objects nrow_vec <- purrr::map_int(x,nrow) # number of rows each file nrow_vec_numbers <- nrow_vec %>% unique # unique number of rows for(nb_rows in nrow_vec_numbers){ aux <- x[which(nrow_vec == nb_rows)] stopifnot(all(unlist(aux %>% map(function(y){all(y[,1]$V1 == aux[[1]][,1]$V1)}) ))) }
045b7659
93fd0bdc	df <- map(nrow_vec_numbers,.f = function(nb_rows){ file_equal_probes <- x[which(nrow_vec == nb_rows)] df <- file_equal_probes %>% map_df(2) colnames(df) <- file_equal_probes %>% map_chr(.f = function(y) colnames(y)[2]) df$V1 <- file_equal_probes[[1]]$V1
84cf1d0b	if (any(duplicated(colnames(df)))){ message("oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo") message("Duplicated samples names were found. Adding _rep suffix to name") message("oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo") } colnames(df)[duplicated(colnames(df))] <- paste0(colnames(df)[duplicated(colnames(df))],"_rep")
93fd0bdc	df }) %>% purrr::reduce(dplyr::full_join,by = "V1") %>% as.data.frame() rownames(df) <- df$V1 df$V1 <- NULL
045b7659	df <- data.matrix(df) if (summarizedExperiment) { df <- makeSEFromDNAMethylationMatrix( betas = df,
c8404f5f	genome = "hg38",
045b7659	met.platform = platform ) }
38297df4	} else {
045b7659	skip <- ifelse(all(grepl("hg38",files)), 0,1) colClasses <- NULL if (!all(grepl("hg38",files))) { colClasses <- c( "character", # Composite Element REF "numeric", # beta value "character", # Gene symbol "character", # Chromosome "integer" ) } x <- plyr::alply(files,1, function(f) { data <- fread( f, header = TRUE, sep = "\t", stringsAsFactors = FALSE, skip = skip, colClasses = colClasses ) setnames(data,gsub(" ", "\\.", colnames(data))) if (!is.null(cases)) setnames(data,2,cases[which(f == files)]) setcolorder(data,c(1, 3:ncol(data), 2)) }, .progress = "time") print.header(paste0("Merging ", length(files)," files"),"subsection") # Just check if the data is in the same order, since we will not merge # the data frames to save memory stopifnot(all(unlist(x %>% map(function(y){all(y[,1] == x[[1]][,1])}) ))) # if hg38 we have 10 columns with probe metadata # if hg19 we have 4 columns with probe metadata idx.dnam <- grep("TCGA",colnames(x[[1]])) df <- x %>% map_df(idx.dnam) colnames(df) <- x %>% map_chr(.f = function(y) colnames(y)[idx.dnam]) df <- bind_cols(x[[1]][,1:(idx.dnam-1)],df) if (summarizedExperiment) { if(skip == 0) { df <- makeSEfromDNAmethylation( df, probeInfo = data.frame(df)[,grep("TCGA",colnames(df),invert = TRUE)] ) } else { df <- makeSEfromDNAmethylation(df) } } else { setDF(df) rownames(df) <- df$Composite.Element.REF df$Composite.Element.REF <- NULL }
38297df4	}
045b7659	return(df)
178c3b9a	}
d568a61d	# Barcode example MMRF_1358_1_BM_CD138pos_T1_TSMRU_L02337 colDataPrepareMMRF <- function(barcode){
045b7659	DataFrame( barcode = barcode, sample = barcode, patient = substr(barcode,1,9) )
d568a61d	}
c6c5016d	colDataPrepareTARGET <- function(barcode){
dc73efdc
045b7659	message("Adding description to TARGET samples") tissue.code <- c( '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '20', '40', '41', '42', '50', '60', '61',
dc73efdc	'85',
045b7659	'99' ) definition <- c( "Primary solid Tumor", # 01 "Recurrent Solid Tumor", # 02 "Primary Blood Derived Cancer - Peripheral Blood", # 03 "Recurrent Blood Derived Cancer - Bone Marrow", # 04 "Additional - New Primary", # 05 "Metastatic", # 06 "Additional Metastatic", # 07 "Tissue disease-specific post-adjuvant therapy", # 08 "Primary Blood Derived Cancer - Bone Marrow", # 09 "Blood Derived Normal", # 10 "Solid Tissue Normal", # 11 "Buccal Cell Normal", # 12 "EBV Immortalized Normal", # 13 "Bone Marrow Normal", # 14 "Fibroblasts from Bone Marrow Normal", # 15 "Mononuclear Cells from Bone Marrow Normal", # 16 "Lymphatic Tissue Normal (including centroblasts)", # 17 "Control Analyte", # 20 "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 "Blood Derived Cancer- Bone Marrow, Post-treatment", # 41 "Blood Derived Cancer- Peripheral Blood, Post-treatment", # 42 "Cell line from patient tumor", # 50 "Xenograft from patient not grown as intermediate on plastic tissue culture dish", # 60 "Xenograft grown in mice from established cell lines", #61
dc73efdc	"Next Generation Cancer Model", # 85
045b7659	"Granulocytes after a Ficoll separation" ) # 99
dc73efdc	aux <- data.frame(tissue.code = tissue.code,definition) # Exceptions: TARGET-AML # "TARGET-20-PAYKXV-Sorted-CD34neg-Lymphoid-09A" 727f5e27-09e9-425f-9757-81da66c054bd # "TARGET-20-PAWUEX-EOI2-14A" 11fb7cce-06ad-445b-b09f-9dc1050e4cd4 # "TARGET-20-PAUSTZ-EOI1-14A" 3ad6dfa7-734d-41b0-ad66-15f0cff630d0 # "TARGET-20-PAYHMK-Sorted-leukemic-09A" f5bd72ae-919f-481c-8203-f878ee1c651a # "TARGET-20-D7-Myeloid-mock3-85" 88712125-67e5-4e47-a413-e55b522a641c # Normal case: TARGET-20-PANLXK-09A-03R
045b7659	# in case multiple equal barcode
dc73efdc	#regex <- paste0("[:alnum:]{6}-[:alnum:]{2}-[:alnum:]{6}", # "-[:alnum:]{3}-[:alnum:]{3}") #samples <- str_match(barcode,regex)[,1]
045b7659
dc73efdc	# this breaks for the exceptions if(!any(grepl(";",barcode))){ ret <- data.frame( barcode = barcode, tumor.code = stringr::str_extract(pattern = "TARGET-[[:alnum:]]{2}",barcode) %>% gsub(pattern = "TARGET-","",.), sample = gsub(pattern = "-[[:alnum:]]{3}$","",barcode), patient = gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",barcode), case.unique.id = gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",barcode) %>% gsub(pattern = "TARGET-[[:alnum:]]{2}-","",.), tissue.code = barcode %>% stringr::str_extract("[[:alnum:]]{2,3}-[[:alnum:]]{3}$") %>% stringr::str_extract("[[:alnum:]]{2,3}") %>% stringr::str_sub(1,2), nucleic.acid.code = barcode %>% stringr::str_extract("[[:alnum:]]{3}$") %>% stringr::str_sub(3,3) ) } else { # mixed samples case ret <- purrr::map_dfr(.x = barcode,.f = function(b){ data.frame( barcode = b, tumor.code = stringr::str_extract(pattern = "TARGET-[[:alnum:]]{2}",b) %>% gsub(pattern = "TARGET-","",.), sample = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y) gsub(pattern = "-[[:alnum:]]{3}$","",y)) %>% paste0(collapse = ";"), patient = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y) gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",y)) %>% paste0(collapse = ";"), case.unique.id = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y) gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",y) %>% gsub(pattern = "TARGET-[[:alnum:]]{2}-","",.) %>% paste0(collapse = ";")), tissue.code = b %>% stringr::str_extract("[[:alnum:]]{2,3}-[[:alnum:]]{3}$") %>% stringr::str_extract("[[:alnum:]]{2,3}") %>% stringr::str_sub(1,2), nucleic.acid.code = b %>% stringr::str_extract("[[:alnum:]]{3}$") %>% stringr::str_sub(3,3) ) }) } ret <- ret %>% dplyr::left_join(aux, by = "tissue.code")
045b7659	tumor.code <- c('00','01','02','03','04','10','15','20','21','30','40', '41','50','51','52','60','61','62','63','64','65','70','71','80','81') tumor.definition <- c( "Non-cancerous tissue", # 00 "Diffuse Large B-Cell Lymphoma (DLBCL)", # 01 "Lung Cancer (all types)", # 02 "Cervical Cancer (all types)", # 03 "Anal Cancer (all types)", # 04 "Acute lymphoblastic leukemia (ALL)", # 10 "Mixed phenotype acute leukemia (MPAL)", # 15 "Acute myeloid leukemia (AML)", # 20 "Induction Failure AML (AML-IF)", # 21 "Neuroblastoma (NBL)", # 30 "Osteosarcoma (OS)", # 40 "Ewing sarcoma", # 41 "Wilms tumor (WT)", # 50 "Clear cell sarcoma of the kidney (CCSK)", # 51 "Rhabdoid tumor (kidney) (RT)", # 52 "CNS, ependymoma", # 60 "CNS, glioblastoma (GBM)", # 61 "CNS, rhabdoid tumor", # 62 "CNS, low grade glioma (LGG)", # 63 "CNS, medulloblastoma", # 64 "CNS, other", # 65 "NHL, anaplastic large cell lymphoma", # 70 "NHL, Burkitt lymphoma (BL)", # 71 "Rhabdomyosarcoma", #80 "Soft tissue sarcoma, non-rhabdomyosarcoma" ) # 81
dc73efdc	aux <- data.frame(tumor.code = tumor.code,tumor.definition) ret <- ret %>% dplyr::left_join(aux, by = "tumor.code")
045b7659	nucleic.acid.code <- c('D','E','W','X','Y','R','S') nucleic.acid.description <- c( "DNA, unamplified, from the first isolation of a tissue", "DNA, unamplified, from the first isolation of a tissue embedded in FFPE", "DNA, whole genome amplified by Qiagen (one independent reaction)", "DNA, whole genome amplified by Qiagen (a second, separate independent reaction)", "DNA, whole genome amplified by Qiagen (pool of 'W' and 'X' aliquots)", "RNA, from the first isolation of a tissue", "RNA, from the first isolation of a tissue embedded in FFPE" )
dc73efdc	aux <- data.frame(nucleic.acid.code = nucleic.acid.code,nucleic.acid.description) ret <- ret %>% dplyr::left_join(aux, by = "nucleic.acid.code")
045b7659	ret <- ret[match(barcode,ret$barcode),] rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE)) ret$code <- NULL return(DataFrame(ret))
c6c5016d	} colDataPrepareTCGA <- function(barcode){
045b7659	# For the moment this will work only for TCGA Data # We should search what TARGET data means
c8404f5f	code <- c( '01','02','03','04','05','06','07','08','09','10','11', '12','13','14','20','40','50','60','61' ) shortLetterCode <- c( "TP","TR","TB","TRBM","TAP","TM","TAM","THOC", "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB", "CELL","XP","XCL" ) definition <- c( "Primary solid Tumor", # 01 "Recurrent Solid Tumor", # 02 "Primary Blood Derived Cancer - Peripheral Blood", # 03 "Recurrent Blood Derived Cancer - Bone Marrow", # 04 "Additional - New Primary", # 05 "Metastatic", # 06 "Additional Metastatic", # 07 "Human Tumor Original Cells", # 08 "Primary Blood Derived Cancer - Bone Marrow", # 09 "Blood Derived Normal", # 10 "Solid Tissue Normal", # 11 "Buccal Cell Normal", # 12 "EBV Immortalized Normal", # 13 "Bone Marrow Normal", # 14 "Control Analyte", # 20 "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 "Cell Lines", # 50 "Primary Xenograft Tissue", # 60 "Cell Line Derived Xenograft Tissue" ) # 61
045b7659	aux <- DataFrame(code = code,shortLetterCode,definition) # in case multiple equal barcode regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}", "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") samples <- str_match(barcode,regex)[,1]
c8404f5f	ret <- DataFrame( barcode = barcode, patient = substr(barcode, 1, 12), sample = substr(barcode, 1, 16), code = substr(barcode, 14, 15) )
045b7659	ret <- merge(ret,aux, by = "code", sort = FALSE) ret <- ret[match(barcode,ret$barcode),] rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE)) ret$code <- NULL return(DataFrame(ret))
178c3b9a	}
cc4e8cfd	#' @title Create samples information matrix for GDC samples
21321b60	#' @description Create samples information matrix for GDC samples add subtype information
a90c7d63	#' @param barcode TCGA or TARGET barcode
f57861ed	#' @importFrom plyr rbind.fill
a4b2a12d	#' @importFrom dplyr left_join
e385d93e	#' @examples
c1e5918e	#' metadata <- colDataPrepare(c("TCGA-OR-A5K3-01A","C3N-00321-01")) #' metadata <- colDataPrepare(c("BLGSP-71-06-00157-01A", #' "BLGSP-71-22-00332-01A"))
a3f2bc5f	#' @export
c6c5016d	colDataPrepare <- function(barcode){
045b7659	# For the moment this will work only for TCGA Data # We should search what TARGET data means message("Starting to add information to samples") ret <- NULL if(all(grepl("TARGET",barcode))) ret <- colDataPrepareTARGET(barcode) if(all(grepl("TCGA",barcode))) ret <- colDataPrepareTCGA(barcode) if(all(grepl("MMRF",barcode))) ret <- colDataPrepareMMRF(barcode) # How to deal with mixed samples "C3N-02003-01;C3N-02003-021" ?
6281eff8	# Check if this breaks the package
045b7659	if(any(grepl("C3N-\|C3L-",barcode))) { ret <- data.frame(
b2bc706f	sample = map(barcode,.f = function(x) stringr::str_split(x,";") %>% unlist) %>% unlist()
045b7659	) }
b2bc706f
045b7659	if(is.null(ret)) { ret <- data.frame( sample = barcode %>% unique, stringsAsFactors = FALSE ) } message(" => Add clinical information to samples") # There is a limitation on the size of the string, so this step will be splited in cases of 100 patient.info <- NULL patient.info <- splitAPICall( FUN = getBarcodeInfo, step = 10, items = ret$sample
38297df4	)
dc73efdc
045b7659	if(!is.null(patient.info)) { ret$sample_submitter_id <- ret$sample %>% as.character() ret <- left_join(ret %>% as.data.frame, patient.info %>% unique, by = "sample_submitter_id") } ret$bcr_patient_barcode <- ret$sample %>% as.character()
7697beda	ret$sample_submitter_id <- ret$sample %>% as.character()
045b7659	if(!"project_id" %in% colnames(ret)) { if("disease_type" %in% colnames(ret)){ aux <- getGDCprojects()[,c(5,7)] aux <- aux[aux$disease_type == unique(ret$disease_type),2] ret$project_id <- as.character(aux) }
7697beda	}
045b7659	# There is no subtype info for target, return as it is if(any(grepl("TCGA",barcode))) { ret <- addSubtypeInfo(ret) } # na.omit should not be here, exceptional case
b2bc706f	if(is.null(ret)) { return( data.frame( row.names = barcode, barcode, stringsAsFactors = FALSE ) ) }
045b7659	# Add purity information from http://www.nature.com/articles/ncomms9971 # purity <- getPurityinfo() # ret <- merge(ret, purity, by = "sample", all.x = TRUE, sort = FALSE) # Put data in the right order ret <- ret[!duplicated(ret$bcr_patient_barcode),] # This part might not work with multiple projects idx <- sapply( X = substr(barcode,1,min(stringr::str_length(ret$bcr_patient_barcode))), FUN = function(x) { grep(x,ret$bcr_patient_barcode) } ) # the code above does not work, since the projects have different string lengths
ad025eb9	if(all(na.omit(ret$project_id) %in% c("TARGET-ALL-P3","TARGET-AML"))) {
045b7659	idx <- sapply(gsub("-[[:alnum:]]{3}$","",barcode), function(x) { grep(x,ret$bcr_patient_barcode) }) } if(any(ret$project_id == "CPTAC-3",na.rm = T)) {
0cb9cae4
f18368c6	# only merge mixed samples
b2bc706f	mixed_samples <- grep(";",barcode,value = T) if(length(mixed_samples) > 0){ mixed_samples <- mixed_samples %>% str_split(";") %>% unlist %>% unique
f18368c6
b2bc706f	ret_mixed_samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed_samples) %>% dplyr::group_by(submitter_id) %>%
f18368c6	dplyr::summarise_all(~trimws(paste(unique(.), collapse = ';'))) %>% as.data.frame()
b2bc706f	ret <- rbind(ret_mixed_samples,ret)
f18368c6	}
0cb9cae4	idx <- match(barcode,ret$bcr_patient_barcode) #idx <- sapply(gsub("-[[:alnum:]]{3}$","",barcode), function(x) { # if(grepl(";",x = x)) x <- stringr::str_split(x[1],";")[[1]][1] # mixed samples # grep(x,ret$bcr_patient_barcode) #})
045b7659	}
dc73efdc	if(any(ret$project_id %in% c("CMI-MBC","TARGET-NBL"),na.rm = T)) {
045b7659	idx <- match(barcode,ret$bcr_patient_barcode) } if(is.list(idx)){ stop( "Prepare will not be possible. \nIf you are trying to prepare more than one different project at a time, please do it separately" ) } ret <- ret[idx,] if("barcode" %in% colnames(ret)) ret$barcode <- barcode rownames(ret) <- barcode return(ret)
c6c5016d	}
e4b4e205	addSubtypeInfo <- function(ret){
045b7659	out <- NULL message(" => Adding TCGA molecular information from marker papers") message(" => Information will have prefix 'paper_' ") for(proj in na.omit(unique(ret$project_id))){ if(grepl("TCGA",proj,ignore.case = TRUE)) { # remove letter from 01A 01B etc ret$sample.aux <- substr(ret$sample,1,15) tumor <- gsub("TCGA-","",proj) available <- c( "ACC", "BRCA", "BLCA", "CESC", "CHOL", "COAD", "ESCA", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LGG", "LUAD", "LUSC", "PAAD", "PCPG", "PRAD", "READ", "SKCM", "SARC", "STAD", "THCA", "UCEC", "UCS", "UVM" ) if (grepl(paste(c(available,"all"),collapse = "\|"),tumor,ignore.case = TRUE)) { subtype <- TCGAquery_subtype(tumor) colnames(subtype) <- paste0("paper_", colnames(subtype)) if(all(str_length(subtype$paper_patient) == 12)){ # Subtype information were to primary tumor in priority subtype$sample.aux <- paste0(subtype$paper_patient,"-01") } ret.aux <- ret[ret$sample.aux %in% subtype$sample.aux,] ret.aux <- merge(ret.aux,subtype, by = "sample.aux", all.x = TRUE) out <- rbind.fill(as.data.frame(out),as.data.frame(ret.aux)) }
e4b4e205	} }
045b7659	if(is.null(out)) return(ret) # We need to put together the samples with subtypes with samples without subytpes ret.aux <- ret[!ret$sample %in% out$sample,] ret <- rbind.fill(as.data.frame(out),as.data.frame(ret.aux)) ret$sample.aux <- NULL return(ret)
e4b4e205	}
178c3b9a	readProteinExpression <- function(files,cases) {
045b7659	pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- read_tsv(file = files[i], col_names = TRUE,skip = 1, col_types = c("cn")) if(!missing(cases)) colnames(data)[2] <- cases[i] if(i == 1) df <- data if(i != 1) df <- merge(df, data,all=TRUE, by="Composite Element REF") setTxtProgressBar(pb, i) } close(pb) return(df)
178c3b9a	}
785ee06b	readProteomeProfiling <- function(files,cases) {
045b7659	list.of.protein.df <- plyr::alply(files,1, function(f) { data <- readr::read_tsv( file = f, show_col_types = FALSE, col_names = TRUE, progress = FALSE ) colnames(data)[ncol(data)] <- cases[which(f == files)] data }, .progress = "time") # Just check if the data is in the same order, since we will not merge # the data frames to save memory df <- tryCatch({ stopifnot(all(unlist(list.of.protein.df %>% map(function(y){all(y[,5] == list.of.protein.df[[1]][,5])}) ))) # need to check if it works in all cases # tested for HTSeq - FPKM-UQ and Counts only bind_cols(list.of.protein.df[[1]][,1:5],list.of.protein.df %>% map_df(6)) },error = function(e){ message("Some files have a different number of proteins, we will introduce NA for the missing values") plyr::join_all(dfs = list.of.protein.df ,type = "full",by = c("AGID","lab_id","catalog_number","set_id","peptide_target")) }) if(!missing(cases)) colnames(df)[-c(1:5)] <- cases return(df)
785ee06b	} makeSEfromProteomeProfiling <- function(){
045b7659
785ee06b	}
079aab6c
7697beda	makeSEfromTranscriptomeProfilingSTAR <- function(
045b7659	data, cases
7697beda	){
045b7659	# How many cases do we have? # We wil consider col 1 is the ensemble gene id, other ones are data size <- ncol(data) # Prepare Patient table colData <- colDataPrepare(cases) # one ensemblID can be mapped to multiple entrezgene ID gene.location <- get.GRCh.bioMart("hg38",as.granges = TRUE) metrics <- subset(data, !grepl("ENSG", data$gene_id)) data <- subset(data, grepl("ENSG", data$gene_id)) found.genes <- table(data$gene_id %in% gene.location$gene_id) if ("FALSE" %in% names(found.genes)){ message(paste0("From the ", nrow(data), " genes we couldn't map ", found.genes[["FALSE"]])) } # Prepare data table # Remove the version from the ensembl gene id assays <- list( data.matrix(data[,grep("^unstranded",colnames(data)),with = FALSE]), data.matrix(data[,grep("stranded_first",colnames(data)),with = FALSE]), data.matrix(data[,grep("stranded_second",colnames(data)),with = FALSE]), data.matrix(data[,grep("tpm_unstrand",colnames(data)),with = FALSE]), data.matrix(data[,grep("fpkm_unstrand",colnames(data)),with = FALSE]), data.matrix(data[,grep("fpkm_uq_unstrand",colnames(data)),with = FALSE]) ) names(assays) <- c( "unstranded", "stranded_first", "stranded_second", "tpm_unstrand", "fpkm_unstrand", "fpkm_uq_unstrand" ) assays <- lapply(assays, function(x){ colnames(x) <- NULL rownames(x) <- NULL return(x) }) # Prepare rowRanges rowRanges <- gene.location[match(data$gene_id, gene.location$gene_id),] names(rowRanges) <- as.character(data$gene_id) message("Available assays in SummarizedExperiment : \n => ",paste(names(assays), collapse = "\n => ")) rse <- SummarizedExperiment( assays = assays, rowRanges = rowRanges, colData = colData ) metadata(rse) <- metrics return(rse)
079aab6c	}
c6c5016d	makeSEfromTranscriptomeProfiling <- function(data, cases, assay.list){
045b7659	# How many cases do we have? # We wil consider col 1 is the ensemble gene id, other ones are data size <- ncol(data) # Prepare Patient table colData <- colDataPrepare(cases) # one ensemblID can be mapped to multiple entrezgene ID gene.location <- get.GRCh.bioMart("hg38") gene.location <- gene.location[!duplicated(gene.location$ensembl_gene_id),] data$ensembl_gene_id <- as.character(gsub("\\.[0-9]*","",data$X1)) data <- subset(data, grepl("ENSG", data$ensembl_gene_id)) found.genes <- table(data$ensembl_gene_id %in% gene.location$ensembl_gene_id) if("FALSE" %in% names(found.genes)) message(paste0("From the ", nrow(data), " genes we couldn't map ", found.genes[["FALSE"]])) data <- merge(data, gene.location, by = "ensembl_gene_id") # Prepare data table # Remove the version from the ensembl gene id assays <- list(data.matrix(data[,2:size+1])) names(assays) <- assay.list assays <- lapply(assays, function(x){ colnames(x) <- NULL rownames(x) <- NULL return(x) }) # Prepare rowRanges rowRanges <- GRanges( seqnames = paste0("chr", data$chromosome_name), ranges = IRanges( start = data$start_position, end = data$end_position ), strand = data$strand, ensembl_gene_id = data$ensembl_gene_id, external_gene_name = data$external_gene_name, original_ensembl_gene_id = data$X1 ) names(rowRanges) <- as.character(data$ensembl_gene_id) message("Available assays in SummarizedExperiment : \n => ",paste(names(assays), collapse = "\n => ")) rse <- SummarizedExperiment( assays = assays, rowRanges = rowRanges, colData = colData ) return(rse)
c6c5016d	}
6bf60949	#' @importFrom dplyr left_join
d7615b50	#' @importFrom plyr alply join_all
dbee506d	#' @importFrom purrr map_dfc map map_df
d7615b50	readTranscriptomeProfiling <- function(
045b7659	files, data.type, workflow.type, cases, summarizedExperiment
d7615b50	) {
045b7659	if(grepl("Gene Expression Quantification", data.type, ignore.case = TRUE)){ # Status working for: # - htseq # - FPKM # - FPKM-UQ if(grepl("HTSeq",workflow.type)){ x <- plyr::alply(files,1, function(f) { readr::read_tsv( file = f, col_names = FALSE, progress = FALSE, col_types = c("cd") ) }, .progress = "time") # Just check if the data is in the same order, since we will not merge # the data frames to save memory stopifnot(all(unlist(x %>% map(function(y){all(y[,1] == x[[1]][,1])}) ))) # need to check if it works in all cases # tested for HTSeq - FPKM-UQ and Counts only df <- bind_cols(x[[1]][,1],x %>% map_df(2)) if(!missing(cases)) colnames(df)[-1] <- cases if(summarizedExperiment) df <- makeSEfromTranscriptomeProfiling(df,cases,workflow.type) } else if(grepl("STAR",workflow.type)){ # read files that has 4 not necessary rows, and has several columns # gene_id gene_name gene_type # unstranded stranded_first stranded_second tpm_unstranded fpkm_unstranded x <- plyr::alply(files,1, function(f) { data.table::fread(f) }, .progress = "time") df <- data.table::rbindlist( x, use.names = TRUE, idcol = "case_barcode" )
3ff7686f	# Exception to the code below: CPTAC-3 # Sample barcode in CPTAC-3 does not handle duplicates # we will need to work with aliquots for it or remove duplicated # files. Example: C3N-02765-02
045b7659	if(!missing(cases)) { df$case_barcode <- factor( cases[df$case_barcode %>% as.numeric()],
3ff7686f	levels = unique(cases)
045b7659	) } # this part changes the cases order if not a factor df <- data.table::dcast( data = df, formula = gene_id + gene_name + gene_type ~ case_barcode, value.var = colnames(df)[-c(1:4)] ) # Adding barcodes to columns names, if user wants a dataframe if(summarizedExperiment) { df <- makeSEfromTranscriptomeProfilingSTAR( data = df, cases = cases ) } } } else if(grepl("miRNA", workflow.type, ignore.case = TRUE) & grepl("miRNA", data.type, ignore.case = TRUE)) { pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- read_tsv(file = files[i], col_names = TRUE,col_types = "cidc") if(!missing(cases)) colnames(data)[2:ncol(data)] <- paste0(colnames(data)[2:ncol(data)],"_",cases[i]) if(i == 1) df <- data if(i != 1) df <- merge(df, data, by=colnames(df)[1],all = TRUE) setTxtProgressBar(pb, i) } close(pb) } else if(grepl("Isoform Expression Quantification", data.type, ignore.case = TRUE)){ pb <- txtProgressBar(min = 0, max = length(files), style = 3) for (i in seq_along(files)) { data <- read_tsv(file = files[i], col_names = TRUE, col_types = c("ccidcc")) if(!missing(cases)) data$barcode <- cases[i] else data$file <- i if(i == 1) df <- data if(i != 1) df <- rbind(df,data) setTxtProgressBar(pb, i) } close(pb)
38297df4	}
045b7659	return(df)
178c3b9a	}
b2bc706f	read_gene_level_copy_number <- function( files, cases, summarizedExperiment = FALSE ){
045b7659	message("Reading Gene Level Copy Number files") gistic.df <- NULL gistic.list <- plyr::alply(files,1,.fun = function(file) { #message("Reading file: ", file) data <- read_tsv( file = file, col_names = TRUE, progress = TRUE, col_types = readr::cols() ) colnames(data)[-c(1:5)] <- paste0(cases[match(file,files)],"_", colnames(data)[-c(1:5)]) return(data) }) # Check if the data is in the same order stopifnot(all(unlist(gistic.list %>% map(function(y){all(y[,1:5] == gistic.list[[1]][,1:5])}) ))) # need to check if it works in all cases # tested for HTSeq - FPKM-UQ and Counts only df <- bind_cols( gistic.list[[1]][,1:5], # gene info gistic.list %>% map_dfc(.f = function(x) x[,6:8]) # copy number, min,max
dd5cb063	)
045b7659	if(summarizedExperiment) {
b2bc706f	se <- make_se_from_gene_level_copy_number(df, cases)
045b7659	return(se) } return(df)
dd5cb063	}
b2bc706f	make_se_from_gene_level_copy_number <- function(df, cases){
045b7659	message("Creating a SummarizedExperiment object") rowRanges <- GRanges( seqnames = df$chromosome, ranges = IRanges(start = df$start, end = df$end), gene_id = df$gene_id, gene_name = df$gene_name ) names(rowRanges) <- as.character(df$gene_id) colData <- colDataPrepare(cases) # We have three data columns for each files assays <- lapply( c("[^max\|min]_copy_number","min_copy_number", "max_copy_number"), function(x){ ret <- data.matrix(subset(df,select = grep(x,colnames(df)))) colnames(ret) <- cases rownames(ret) <- NULL ret }) names(assays) <- c("copy_number","min_copy_number", "max_copy_number") message("Available assays in SummarizedExperiment : \n => ",paste(names(assays), collapse = "\n => ")) rse <- SummarizedExperiment(assays = assays, rowRanges = rowRanges, colData = colData) return(rse)
7b7f18e0	}
c8c9c620	readGISTIC <- function(files, cases){
045b7659	message("Reading GISTIC file") gistic.df <- NULL gistic.list <- plyr::alply(files,1,.fun = function(file) { message("Reading file: ", file) data <- read_tsv( file = file, col_names = TRUE, progress = TRUE, col_types = readr::cols() ) aliquot <- colnames(data)[-c(1:3)] info <- splitAPICall( FUN = getBarcodefromAliquot, step = 20, items = aliquot ) barcode <- as.character(info$submitter_id)[match(aliquot,as.character(info$aliquot_id))] colnames(data)[-c(1:3)] <- barcode return(data) }) gistic.df <- gistic.list %>% join_all(by = c("Gene Symbol","Gene ID","Cytoband"), type='full') return(gistic.df)
c8c9c620	}
717cfd63	# Reads Copy Number Variation files to a data frame, basically it will rbind it
f3faeb86	#' @importFrom purrr map2_dfr
b2bc706f	read_copy_number_variation <- function(files, cases){
045b7659	message("Reading copy number variation files") col_types <- ifelse(any(grepl('ascat2', files)),"ccnnnnn","ccnnnd") purrr::map2_dfr( .x = files, .y = cases, .f = function(file,case) { data <- readr::read_tsv(file, col_names = TRUE, col_types = col_types); if(!missing(case)) data$Sample <- case data })
178c3b9a	}
57189bb2	# getBarcodeInfo(c("TCGA-A6-6650-01B"))
9d34d5f7	# getBarcodeInfo(c("DLBCL10508-sample"))
57189bb2	addFFPE <- function(df) {
045b7659	message("Add FFPE information. More information at: \n=> https://cancergenome.nih.gov/cancersselected/biospeccriteria \n=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html") barcode <- df$barcode patient <- df$patient ffpe.info <- NULL ffpe.info <- splitAPICall(FUN = getFFPE, step = 20, items = df$patient) df <- merge(df, ffpe.info,by.x = "sample", by.y = "submitter_id") df <- df[match(barcode,df$barcode),] return(df)
57189bb2	}
008efdf3	# getFFPE("TCGA-A6-6650") #' @importFrom plyr rbind.fill #' @importFrom httr content
57189bb2	getFFPE <- function(patient){
045b7659	baseURL <- "https://api.gdc.cancer.gov/cases/?" options.pretty <- "pretty=true" options.expand <- "expand=samples" option.size <- paste0("size=",length(patient))
b2bc706f	options.filter <- paste0( "filters=", URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), paste0('"',paste(patient,collapse = '","')), URLencode('"]}}]}') )
045b7659	url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), error = function(e) { message(paste("Error: ", e, sep = " ")) message("We will retry to access GDC again! URL:") #message(url) fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE) } ) results <- json$data$hits results <- rbind.fill(results$samples)[,c("submitter_id","is_ffpe")] return(results)
57189bb2	}
c8c9c620	getAliquot_ids <- function(barcode){
045b7659	baseURL <- "https://api.gdc.cancer.gov/cases/?" options.fields <- "fields=samples.portions.analytes.aliquots.aliquot_id,samples.portions.analytes.aliquots.submitter_id" options.pretty <- "pretty=true" option.size <- paste0("size=",length(barcode)) #message(paste(barcode,collapse = '","')) #message(paste0('"',paste(barcode,collapse = '","')))
b2bc706f	options.filter <- paste0( "filters=", URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), paste0('"',paste(barcode,collapse = '","')), URLencode('"]}}]}') )
045b7659	#message(paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))) url <- paste0(baseURL,paste(options.pretty,options.fields, option.size, options.filter, sep = "&")) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), error = function(e) { message(paste("Error: ", e, sep = " ")) message("We will retry to access GDC again! URL:") #message(url) fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE) } ) results <- unlist(json$data$hits$samples) results.barcode <- grep("TCGA",results,value = TRUE) results.aliquot <- grep("TCGA",results,value = TRUE,invert = TRUE) df <- data.frame(results.aliquot,results.barcode) colnames(df) <- c("aliquot_id","barcode") return(df)
c8c9c620	}
e4b4e205	# getBarcodeInfo(c("TCGA-OR-A5K3-01A","C3N-00321-01")) # barcode is: sample_submitter_id
eb4267de	#' @importFrom dplyr bind_cols slice row_number #'
dc887359	getBarcodeInfo <- function(barcode) {
045b7659	baseURL <- "https://api.gdc.cancer.gov/cases/?" options.pretty <- "pretty=true" options.expand <- "expand=samples,project,diagnoses,diagnoses.treatments,annotations,family_histories,demographic,exposures" option.size <- paste0("size=",length(barcode)) options.filter <- paste0("filters=", URLencode('{"op":"or","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['), paste0('"',paste(barcode,collapse = '","')), URLencode('"]}},'), URLencode('{"op":"in","content":{"field":"submitter_sample_ids","value":['), paste0('"',paste(barcode,collapse = '","')), URLencode('"]}},'), URLencode('{"op":"in","content":{"field":"submitter_aliquot_ids","value":['), paste0('"',paste(barcode,collapse = '","')), URLencode('"]}}'), URLencode(']}') ) url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")) #message(url) json <- tryCatch( getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE), error = function(e) { message(paste("Error: ", e, sep = " ")) message("We will retry to access GDC again! URL:") #message(url) fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE) } ) results <- json$data$hits # no results if(length(results) == 0){ return(data.frame(barcode,stringsAsFactors = FALSE))
9bda1de1	}
045b7659	submitter_id <- results$submitter_id submitter_aliquot_ids <- results$submitter_aliquot_ids if(!is.null(results$samples)) { samples <- rbindlist(results$samples, fill = TRUE) samples <- samples[match(barcode,samples$submitter_id),]
9d34d5f7	samples$sample_submitter_id <- str_extract_all( samples$submitter_id,paste(barcode,collapse = "\|")) %>%
045b7659	unlist %>% as.character tryCatch({ samples$submitter_id <-
9d34d5f7	str_extract_all( samples$submitter_id, paste(c(submitter_id,barcode), collapse = "\|"), simplify = TRUE ) %>% as.character
045b7659	}, error = function(e){ samples$submitter_id <- submitter_id }) df <- samples[!is.na(samples$submitter_id),] suppressWarnings({ df[,c("updated_datetime","created_datetime")] <- NULL })
b6275ad3	}
045b7659	# We dont have the same cols for TCGA and TARGET so we need to check them if(!is.null(results$diagnoses)) { diagnoses <- rbindlist(lapply(results$diagnoses, function(x) if(is.null(x)) data.frame(NA) else x),fill = TRUE) diagnoses[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(diagnoses)))) {
9d34d5f7	diagnoses$submitter_id <- gsub("-diagnosis\|_diagnosis.*\|-DIAG\|diag-","", diagnoses$submitter_id)
045b7659	} else { diagnoses$submitter_id <- submitter_id } # this is required since the sample might not have a diagnosis if(!any(df$submitter_id %in% diagnoses$submitter_id)){ diagnoses$submitter_id <- NULL # The sample migth have different sample types # The diagnosis the same for each one of these samples # in that case we will have a 1 to mapping and binding will # not work. We need then to replicate diagnosis to each sample # and not each patient # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R")) if(nrow(diagnoses) < nrow(df)){ diagnoses <- plyr::ldply( 1:length(results$submitter_sample_ids), .fun = function(x){ diagnoses[x] %>% # replicate diagnoses the number of samples as.data.frame() %>% dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))}) } df <- dplyr::bind_cols(df %>% as.data.frame,diagnoses %>% as.data.frame) } else { df <- left_join(df, diagnoses, by = "submitter_id") }
9bda1de1	}
045b7659	if(!is.null(results$exposures)) {
b2bc706f	exposures <- rbindlist( lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x), fill = TRUE )
045b7659	exposures[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(exposures)))) {
9d34d5f7	exposures$submitter_id <- gsub("-exposure\|_exposure.*\|-EXP","", exposures$submitter_id)
045b7659	} else { exposures$submitter_id <- submitter_id } if(!any(df$submitter_id %in% exposures$submitter_id)){ exposures$submitter_id <- NULL df <- dplyr::bind_cols(df,exposures) } else { df <- left_join(df, exposures, by = "submitter_id") }
b6275ad3	}
045b7659	if(!is.null(results$demographic)) { demographic <- results$demographic demographic[,c("updated_datetime","created_datetime","state")] <- NULL if(any(grepl("submitter_id", colnames(demographic)))) {
b2bc706f	demographic$submitter_id <- gsub( "-demographic\|_demographic.*\|-DEMO\|demo-", "", results$demographic$submitter_id )
045b7659	} else { demographic$submitter_id <- submitter_id } if(!any(df$submitter_id %in% demographic$submitter_id)){ demographic$submitter_id <- NULL demographic$updated_datetime <- NULL demographic$created_datetime <- NULL # The sample migth have different sample types # The diagnosis the same for each one of these samples # in that case we will have a 1 to mapping and binding will # not work. We need then to replicate diagnosis to each sample # and not each patient # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R")) if(nrow(demographic) < nrow(df)){ demographic <- plyr::ldply( 1:length(results$submitter_sample_ids), .fun = function(x){ demographic[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>%
b2bc706f	dplyr::slice( rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) })
045b7659	} df <- dplyr::bind_cols(df %>% as.data.frame,demographic) } else { df <- left_join(df,demographic, by = "submitter_id") }
9bda1de1	}
045b7659	df$bcr_patient_barcode <- df$submitter_id %>% as.character() projects.info <- results$project projects.info <- results$project[,grep("state",colnames(projects.info),invert = TRUE)] if(any(submitter_id %in% df$submitter_id)){ projects.info <- cbind("submitter_id" = submitter_id, projects.info) suppressWarnings({
b2bc706f	df <- left_join( df, projects.info, by = "submitter_id" )
045b7659	})
b6275ad3	} else {
045b7659	if(nrow(projects.info) < nrow(df)){ projects.info <- plyr::ldply( 1:length(results$submitter_sample_ids), .fun = function(x){ projects.info[x,] %>% # replicate diagnoses the number of samples as.data.frame() %>%
b2bc706f	dplyr::slice( rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode))) })
045b7659	} df <- dplyr::bind_cols(df,projects.info)
b6275ad3	}
045b7659	# Adding in the same order if(any(substr(barcode,1,str_length(df$submitter_id)) %in% df$submitter_id)){ df <- df[match(substr(barcode,1,str_length(df$sample_submitter_id)),df$sample_submitter_id),] # This line should not exists, but some patients does not have clinical data # case: TCGA-R8-A6YH" # this has been reported to GDC, waiting answers # So we will remove this NA cases df <- df[!is.na(df$submitter_id),]
7697beda	} else {
045b7659	idx <- sapply(substr(barcode,1,str_length(df$submitter_aliquot_ids) %>% max), FUN = function(x){ grep(x,df$submitter_aliquot_ids) }) df <- df[idx,]
38297df4	}
045b7659	# remove empty columns df <- df %>% as.data.frame() %>% dplyr::select(which(colSums(is.na(df)) < nrow(df))) return(df)
dc887359	}
cb8583c7	#' @title Prepare CEL files into an AffyBatch. #' @description Prepare CEL files into an AffyBatch. #' @param ClinData write #' @param PathFolder write #' @param TabCel write #' @examples #' \dontrun{ #' to add example #' } #' @export
86722ee7	#' @return Normalized Expression data from Affy eSets
cb8583c7	TCGAprepare_Affy <- function(ClinData, PathFolder, TabCel){
045b7659	if (!requireNamespace("affy", quietly = TRUE)) { stop("affy package is needed for this function to work. Please install it.", call. = FALSE) } if (!requireNamespace("Biobase", quietly = TRUE)) { stop("affy package is needed for this function to work. Please install it.", call. = FALSE) } affy_batch <- affy::ReadAffy(filenames = as.character(paste(TabCel$samples, ".CEL", sep = ""))) eset <- affy::rma(affy_batch) mat <- Biobase::exprs(eset) return(mat)
cb8583c7	}