R/prepare.R
178c3b9a
 #' @title Prepare GDC data
 #' @description
 #'   Reads the data downloaded and prepare it into an R object
 #' @param query A query for GDCquery function
 #' @param save Save result as RData object?
 #' @param save.filename Name of the file to be save if empty an automatic will be created
c1b5e11c
 #' @param directory Directory/Folder where the data was downloaded. Default: GDCdata
67c15b4c
 #' @param summarizedExperiment Create a summarizedExperiment? Default TRUE (if possible)
c1b5e11c
 #' @param remove.files.prepared Remove the files read? Default: FALSE
 #' This argument will be considered only if save argument is set to true
48fb8c4e
 #' @param add.gistic2.mut If a list of genes (gene symbol) is given, columns with gistic2 results from GDAC firehose (hg19)
 #' and a column indicating if there is or not mutation in that gene (hg38)
 #' (TRUE or FALSE - use the MAF file for more information)
 #' will be added to the sample matrix in the summarized Experiment object.
a58ac18c
 #' @param mutant_variant_classification List of mutant_variant_classification that will be
 #' consider a sample mutant or not. Default: "Frame_Shift_Del", "Frame_Shift_Ins",
 #' "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del",
 #' "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation"
178c3b9a
 #' @export
88ef1778
 #' @examples
093df0db
 #' \dontrun{
f22676b7
 #' query <- GDCquery(
 #'   project = "TCGA-KIRP",
 #'   data.category = "Simple Nucleotide Variation",
4413c90f
 #'   data.type = "Masked Somatic Mutation"
f22676b7
 #' )
88ef1778
 #' GDCdownload(query, method = "api", directory = "maf")
 #' maf <- GDCprepare(query, directory = "maf")
 #'
a77d212e
 #' }
178c3b9a
 #' @return A summarizedExperiment or a data.frame
ee9a77b7
 #' @importFrom  S4Vectors DataFrame
e7804dce
 #' @importFrom SummarizedExperiment metadata<-
eba132ca
 #' @importFrom data.table setcolorder setnames
9a62fdd6
 #' @importFrom GenomicRanges GRanges
 #' @importFrom IRanges IRanges
28ae53ec
 #' @importFrom purrr map_chr
02c318ae
 #' @author Tiago Chedraoui Silva
afcb2a28
 GDCprepare <- function(
045b7659
         query,
         save = FALSE,
         save.filename,
         directory = "GDCdata",
         summarizedExperiment = TRUE,
         remove.files.prepared = FALSE,
         add.gistic2.mut = NULL,
         mutant_variant_classification = c(
             "Frame_Shift_Del",
             "Frame_Shift_Ins",
             "Missense_Mutation",
             "Nonsense_Mutation",
             "Splice_Site",
             "In_Frame_Del",
             "In_Frame_Ins",
             "Translation_Start_Site",
             "Nonstop_Mutation"
919dc166
         )
045b7659
 ){
 
     isServeOK()
     if(missing(query)) stop("Please set query parameter")
 
b2bc706f
     test.duplicated.cases <- (
         any(
             duplicated(query$results[[1]]$cases)) & # any duplicated
             !(query$data.type %in% c(
                 "Clinical data",
                 "Protein expression quantification",
                 "Raw intensities",
                 "Masked Intensities",
                 "Clinical Supplement",
                 "Masked Somatic Mutation",
                 "Biospecimen Supplement"
             )
             )
     )
045b7659
 
 
     if(test.duplicated.cases) {
         dup <- query$results[[1]]$cases[duplicated(query$results[[1]]$cases)]
         cols <- c("tags","cases","experimental_strategy","analysis_workflow_type")
         cols <- cols[cols %in% colnames(query$results[[1]])]
         dup <- query$results[[1]][query$results[[1]]$cases %in% dup,cols]
         dup <- dup[order(dup$cases),]
         print(knitr::kable(dup))
         stop("There are samples duplicated. We will not be able to prepare it")
     }
 
     if (!save & remove.files.prepared) {
         stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost")
     }
b2bc706f
 
     # We save the files in project/data.category/data.type/file_id/file_name
045b7659
     files <- file.path(
b2bc706f
         query$results[[1]]$project,
afcb2a28
         gsub(" ","_",query$results[[1]]$data_category),
         gsub(" ","_",query$results[[1]]$data_type),
045b7659
         gsub(" ","_",query$results[[1]]$file_id),
afcb2a28
         gsub(" ","_",query$results[[1]]$file_name)
38297df4
     )
045b7659
 
     files <- file.path(directory, files)
 
     # For IDAT prepare since we need to put all IDATs in the same folder the code below will not work
     # a second run
     if (!all(file.exists(files))) {
b2bc706f
         # We have to check we moved the files
         if (
             unique(query$results[[1]]$data_type) == "Masked Intensities" |
             unique(query$results[[1]]$data_category) == "Raw microarray data"
         ){
 
045b7659
             files.idat <- file.path(
b2bc706f
                 query$results[[1]]$project,
045b7659
                 gsub(" ","_",query$results[[1]]$data_category),
                 gsub(" ","_",query$results[[1]]$data_type),
                 gsub(" ","_",query$results[[1]]$file_name)
             )
b2bc706f
 
045b7659
             files.idat <- file.path(directory, files.idat)
             if (!all(file.exists(files) | file.exists(files.idat))) {
                 stop(
                     paste0(
                         "I couldn't find all the files from the query. ",
                         "Please check if the directory parameter is right ",
                         "or `GDCdownload` downloaded the samples."
                     )
                 )
             }
         } else {
             stop(
                 paste0(
                     "I couldn't find all the files from the query. ",
                     "Please check if the directory parameter is right ",
                     "or `GDCdownload` downloaded the samples."
                 )
             )
         }
178c3b9a
     }
045b7659
 
     cases <- ifelse(
c4457d9c
         grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-2|BEATAML1.0-COHORT",query$results[[1]]$project %>% unlist()),
045b7659
         query$results[[1]]$cases,
         query$results[[1]]$sample.submitter_id
     )
 
     if (grepl("Transcriptome Profiling", query$data.category, ignore.case = TRUE)){
 
         if(unique(query$results[[1]]$experimental_strategy) == "scRNA-Seq"){
             #if (grepl("Single Cell Analysis", unique(query$results[[1]]$data_type), ignore.case = TRUE)){
 
             data <- readSingleCellAnalysis(
                 files = files,
                 data_format = unique(query$results[[1]]$data_format),
                 workflow.type = unique(query$results[[1]]$analysis_workflow_type),
                 cases = cases
             )
             return(data)
         } else {
             data <- readTranscriptomeProfiling(
                 files = files,
                 data.type = ifelse(!is.na(query$data.type),  as.character(query$data.type),  unique(query$results[[1]]$data_type)),
                 workflow.type = unique(query$results[[1]]$analysis_workflow_type),
                 cases = cases,
                 summarizedExperiment
             )
         }
     } else if(grepl("Copy Number Variation",query$data.category,ignore.case = TRUE)) {
         if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number Scores") {
             data <- readGISTIC(files, query$results[[1]]$cases)
         } else if (unique(query$results[[1]]$data_type) == "Gene Level Copy Number") {
b2bc706f
             data <- read_gene_level_copy_number(
                 files = files,
                 cases = query$results[[1]]$sample.submitter_id,
                 summarizedExperiment = summarizedExperiment
             )
045b7659
         } else {
b2bc706f
             data <- read_copy_number_variation(
                 files = files, cases = query$results[[1]]$cases
             )
045b7659
         }
     }  else if (grepl("Methylation Beta Value",unique(query$results[[1]]$data_type), ignore.case = TRUE)) {
         data <- readDNAmethylation(
             files = files,
             cases = cases,
             summarizedExperiment = summarizedExperiment,
c8404f5f
             platform =  unique(query$results[[1]]$platform)
38297df4
         )
045b7659
     }  else if (grepl("Raw intensities|Masked Intensities",query$data.type, ignore.case = TRUE)) {
         # preparing IDAT files
         data <- readIDATDNAmethylation(
             files = files,
             barcode = cases,
             summarizedExperiment = summarizedExperiment,
c8404f5f
             platform =  unique(query$results[[1]]$platform)
38297df4
         )
b2bc706f
     } else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
045b7659
 
         data <- readProteomeProfiling(files, cases = cases)
 
b2bc706f
     } else if (grepl("Protein expression",query$data.category,ignore.case = TRUE)) {
045b7659
 
         data <- readProteinExpression(files, cases = cases)
b2bc706f
 
         if (summarizedExperiment) {
045b7659
             message("SummarizedExperiment not implemented, if you need samples metadata use the function TCGAbiolinks:::colDataPrepare")
         }
 
     }  else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) {
 
b2bc706f
         if (grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
045b7659
             data <- readSimpleNucleotideVariationMaf(files)
         }
 
     }  else if (grepl("Clinical|Biospecimen", query$data.category, ignore.case = TRUE)){
b2bc706f
         data <- read_clinical(files, query$data.type, cases = cases)
045b7659
         summarizedExperiment <- FALSE
     } else if (grepl("Gene expression",query$data.category,ignore.case = TRUE)) {
         if (query$data.type == "Gene expression quantification")
b2bc706f
             data <- read_gene_expression_quantification(
045b7659
                 files = files,
                 cases = cases,
                 summarizedExperiment = summarizedExperiment,
c8404f5f
                 genome = "hg38",
045b7659
                 experimental.strategy = unique(query$results[[1]]$experimental_strategy)
             )
 
         if (query$data.type == "miRNA gene quantification")
b2bc706f
             data <- read_gene_expression_quantification(
045b7659
                 files = files,
                 cases = cases,
                 summarizedExperiment = FALSE,
c8404f5f
                 genome = "hg38",
045b7659
                 experimental.strategy = unique(query$results[[1]]$experimental_strategy)
             )
 
         if (query$data.type == "miRNA isoform quantification")
             data <- readmiRNAIsoformQuantification(
                 files = files,
                 cases = query$results[[1]]$cases
             )
 
         if (query$data.type == "Isoform expression quantification")
             data <- readIsoformExpressionQuantification(files = files, cases = cases)
 
         if (query$data.type == "Exon quantification")
             data <- readExonQuantification(
                 files = files,
                 cases = cases,
                 summarizedExperiment = summarizedExperiment
             )
 
38297df4
     }
045b7659
     # Add data release to object
     if (summarizedExperiment & !is.data.frame(data)) {
         metadata(data) <- list("data_release" = getGDCInfo()$data_release)
38297df4
     }
045b7659
 
 
     if ((!is.null(add.gistic2.mut)) & summarizedExperiment) {
         message("=> Adding GISTIC2 and mutation information....")
         genes <- tolower(levels(EAGenes$Gene))
         if (!all(tolower(add.gistic2.mut) %in% genes)) {
             message(
                 paste("These genes were not found:\n",
                       paste(add.gistic2.mut[! tolower(add.gistic2.mut) %in% genes],collapse = "\n=> ")
                 )
             )
         }
         add.gistic2.mut <- add.gistic2.mut[tolower(add.gistic2.mut) %in% tolower(genes)]
         if (length(add.gistic2.mut) > 0){
             info <- colData(data)
             for(i in unlist(query$project)){
                 info <- get.mut.gistc.information(
                     info,
                     i,
                     add.gistic2.mut,
                     mutant_variant_classification = mutant_variant_classification
                 )
             }
             colData(data) <- info
         }
     }
     if("samples" %in% colnames(data)){
         if(any(duplicated(data$sample))) {
             message("Replicates found.")
             if(any(data$is_ffpe)) message("FFPE should be removed. You can modify the data with the following command:\ndata <- data[,!data$is_ffpe]")
             print(as.data.frame(colData(data)[data$sample %in% data$sample[duplicated(data$sample)],c("is_ffpe"),drop = FALSE]))
         }
38297df4
     }
045b7659
 
 
     if(save){
         if(missing(save.filename) & !missing(query)) save.filename <- paste0(query$project,gsub(" ","_", query$data.category),gsub(" ","_",date()),".RData")
         message(paste0("=> Saving file: ",save.filename))
         save(data, file = save.filename)
         message("=> File saved")
 
         # save is true, due to the check in the beggining of the code
         if(remove.files.prepared){
             # removes files and empty directories
b2bc706f
             remove_files_recursively(files)
045b7659
         }
     }
 
     return(data)
178c3b9a
 }
 
b2bc706f
 remove_files_recursively <- function(files){
045b7659
     files2rm <- dirname(files)
     unlink(files2rm,recursive = TRUE)
     files2rm <- dirname(files2rm) # data category
b2bc706f
     if(length(list.files(files2rm)) == 0) remove_files_recursively(files2rm)
c1b5e11c
 }
26eb7e14
 
 
b2bc706f
 read_clinical <- function(files, data.type, cases){
045b7659
     if(data.type == "Clinical data"){
         suppressMessages({
             ret <- plyr::alply(files,.margins = 1,readr::read_tsv, .progress = "text")
         })
         names(ret) <- gsub("nationwidechildrens.org_","",gsub(".txt","",basename(files)))
     } else if(data.type %in% c("Clinical Supplement","Biospecimen Supplement")){
         ret <- plyr::alply(files,.margins = 1,function(f) {
             readr::read_tsv(f,col_types = readr::cols())
         }, .progress = "text")
         names(ret) <- gsub("nationwidechildrens.org_","",gsub(".txt","",basename(files)))
     }
     return(ret)
5dab05e3
 }
cc200045
 
f1d96802
 
f22676b7
 readSingleCellAnalysis <- function(
045b7659
         files = files,
         data_format = NULL,
         workflow.type = NULL,
         cases = cases
f22676b7
 ) {
045b7659
 
     if(data_format == "MEX" & workflow.type == "CellRanger - 10x Filtered Counts"){
         check_package("Seurat")
         ret <- plyr::llply(files,.fun = function(f){
             untar(tarfile = f,exdir = dirname(f))
             Seurat::Read10X(data.dir = gsub("\\.tar\\.gz","",f))
         },.progress = "time")
         names(ret) <- cases
     }
 
     if(data_format == "MEX" & workflow.type == "CellRanger - 10x Raw Counts"){
         ret <- plyr::llply(files,.fun = function(f){
             # uncompress raw file
             untar(tarfile = f,exdir = dirname(f))
             Read10X(data.dir = gsub("\\.tar\\.gz","",f))
         },.progress = "time")
         names(ret) <- cases
     }
     # TSV files
     if(data_format == "TSV"){
         ret <- plyr::llply(files,.fun = function(f){
             readr::read_tsv(f)
         },.progress = "time")
         names(ret) <- cases
     }
 
     if(data_format == "HDF5"){
 
         stop("We are not preparing loom files")
         # check_package("SeuratDisk")
         # check_package("Seurat")
         # ret <- SeuratDisk::Connect(filename = files, mode = "r")
         # ret <- Seurat::as.Seurat(ret)
         print(files)
     }
 
     # HDF5
     return(ret)
f22676b7
 }
 
9e56ad43
 #' @importFrom tidyr separate
b2bc706f
 readExonQuantification <- function (
         files,
         cases,
         summarizedExperiment = TRUE
 ){
045b7659
     pb <- txtProgressBar(min = 0, max = length(files), style = 3)
     assay.list <- NULL
 
     for (i in seq_along(files)) {
         data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE)
 
         if(!missing(cases)) {
             assay.list <- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)])
             # We will use this because there might be more than one col for each samples
             setnames(data,colnames(data)[2:ncol(data)],
                      paste0(gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i]))
         }
         if (i == 1) {
             df <- data
         } else {
             df <- merge(df, data, by=colnames(data)[1], all = TRUE)
         }
b2bc706f
 
045b7659
         setTxtProgressBar(pb, i)
f1d96802
     }
045b7659
     setDF(df)
     rownames(df) <- df[,1]
     df <- df %>% separate(exon,into = c("seqnames","coordinates","strand"),sep = ":") %>%
         separate(coordinates,into = c("start","end"),sep = "-")
b2bc706f
 
045b7659
     if(summarizedExperiment) {
         suppressWarnings({
             assays <- lapply(assay.list, function (x) {
                 return(data.matrix(subset(df, select = grep(x,colnames(df),ignore.case = TRUE))))
             })
         })
         names(assays) <- assay.list
b2bc706f
         regex <- paste0(
             "[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}",
             "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}"
         )
045b7659
         samples <- na.omit(unique(str_match(colnames(df),regex)[,1]))
         colData <-  colDataPrepare(samples)
         assays <- lapply(assays, function(x){
             colnames(x) <- NULL
             rownames(x) <- NULL
             return(x)
         })
         rowRanges <- makeGRangesFromDataFrame(df)
         message("Available assays in SummarizedExperiment : \n  => ",paste(names(assays), collapse = "\n  => "))
         rse <- SummarizedExperiment(
             assays = assays,
             rowRanges = rowRanges,
             colData = colData
         )
         return(rse)
f1d96802
     }
045b7659
     return(df)
 
f1d96802
 }
cc200045
 readIsoformExpressionQuantification <- function (files, cases){
045b7659
     pb <- txtProgressBar(min = 0, max = length(files), style = 3)
 
     for (i in seq_along(files)) {
         data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE)
 
         if(!missing(cases)) {
             assay.list <- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)])
             # We will use this because there might be more than one col for each samples
             setnames(data,colnames(data)[2:ncol(data)],
                      paste0(gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i]))
         }
         if (i == 1) {
             df <- data
         } else {
             df <- merge(df, data, by=colnames(data)[1], all = TRUE)
         }
         setTxtProgressBar(pb, i)
38297df4
     }
045b7659
     setDF(df)
     rownames(df) <- df[,1]
     df[,1] <- NULL
     return(df)
 
cc200045
 }
93f1c154
 readmiRNAIsoformQuantification <- function (files, cases){
045b7659
     pb <- txtProgressBar(min = 0, max = length(files), style = 3)
 
     for (i in seq_along(files)) {
         data <- fread(files[i], header = TRUE, sep = "\t", stringsAsFactors = FALSE)
         data$barcode <- cases[i]
         if (i == 1) {
             df <- data
         } else {
             df <- rbind(df, data)
         }
         setTxtProgressBar(pb, i)
93f1c154
     }
045b7659
     setDF(df)
 
93f1c154
 }
3fc605c6
 
88ef1778
 readSimpleNucleotideVariationMaf <- function(files){
045b7659
 
3fc605c6
     ret <- files |>
         purrr::map_dfr(.f = function(x) {
             tab <- readr::read_tsv(
                 x,
                 show_col_types = FALSE,
045b7659
                 comment = "#",
3fc605c6
                 col_types = readr::cols(
                     SOMATIC =  col_character(),
                     PUBMED = col_character(),
                     miRNA = col_character(),
                     HGVS_OFFSET = col_integer(),
                     PHENO = col_character(),
045b7659
                     Entrez_Gene_Id = col_integer(),
                     Start_Position = col_integer(),
                     End_Position = col_integer(),
                     t_depth = col_integer(),
                     t_ref_count = col_integer(),
                     t_alt_count = col_integer(),
                     n_depth = col_integer(),
                     TRANSCRIPT_STRAND = col_integer(),
                     PICK = col_integer(),
                     TSL = col_integer(),
c1b9dfef
                     Allele =  col_character(),
36436f05
                     Tumor_Seq_Allele1 =  col_character(),
                     Reference_Allele = col_character(),
c1b9dfef
                     Tumor_Seq_Allele2 = col_character(),
3fc605c6
                     DISTANCE = col_integer()
                 ))
 
             # empty MAF file
             # https://portal.gdc.cancer.gov/files/7917fcbe-cb66-447d-8ea8-3a324feee3fa
             if(nrow(tab) == 0) {return (NULL)}
             tab
045b7659
         })
3fc605c6
 
045b7659
     return(ret)
88ef1778
 }
 
b2bc706f
 read_gene_expression_quantification <- function(
045b7659
         files,
         cases,
         genome = "hg19",
         summarizedExperiment = TRUE,
         experimental.strategy,
         platform
e84eacd4
 ){
045b7659
     skip <- unique((ifelse(experimental.strategy == "Gene expression array",1,0)))
 
     if(length(skip) > 1) stop("It is not possible to handle those different platforms together")
 
     print.header(paste0("Reading ", length(files)," files"),"subsection")
     ret <- plyr::alply(
         .data = seq_along(files),
         .margins = 1,
         .fun = function(i,cases){
 
             data <- fread(
                 input = files[i],
                 header = TRUE,
                 sep = "\t",
                 stringsAsFactors = FALSE,
                 skip = skip
             )
 
             if(!missing(cases)) {
                 assay.list <<- gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)])
                 # We will use this because there might be more than one col for each samples
                 setnames(
                     data,
                     colnames(data)[2:ncol(data)],
                     paste0(gsub(" |\\(|\\)|\\/","_",colnames(data)[2:ncol(data)]),"_",cases[i])
                 )
             }
             data
         },.progress = "time",cases = cases)
 
     print.header(paste0("Merging ", length(files)," files"),"subsection")
 
     # Just check if the data is in the same order, since we will not merge
     # the data frames to save memory
     stopifnot(all(unlist(ret %>% map(function(y){all(y[,1] ==  ret[[1]][,1])}) )))
 
     # need to check if it works in all cases
     df <- ret %>% map( ~ (.x %>% dplyr::select(-1))) %>% bind_cols()
     df <- bind_cols(ret[[1]][,1],df)
 
     if (summarizedExperiment) {
b2bc706f
         df <- make_se_from_gene_exoression_quantification(df, assay.list, genome = genome)
045b7659
     } else {
         rownames(df) <- df$gene_id
         df$gene_id <- NULL
     }
     return(df)
14236a24
 }
e84eacd4
 
 
b2bc706f
 make_se_from_gene_exoression_quantification <- function(
045b7659
         df,
         assay.list,
         genome = "hg19"
18fd2879
 ){
045b7659
 
     # Access genome information to create SE
     gene.location <- get.GRCh.bioMart(genome)
 
     if(all(grepl("\\|",df[[1]]))){
         aux <- strsplit(df$gene_id,"\\|")
         GeneID <- unlist(lapply(aux,function(x) x[2]))
         df$entrezgene_id <- as.numeric(GeneID)
         gene.location <- gene.location[!duplicated(gene.location$entrezgene_id),]
         df <- merge(df, gene.location, by = "entrezgene_id")
     } else {
         df$external_gene_name <- as.character(df[[1]])
         df <- merge(df, gene.location, by = "external_gene_name")
     }
 
 
     if("transcript_id" %in% assay.list){
         rowRanges <- GRanges(
             seqnames = paste0("chr", df$chromosome_name),
             ranges = IRanges(start = df$start_position,
                              end = df$end_position),
             strand = df$strand,
             gene_id = df$external_gene_name,
             entrezgene = df$entrezgene_id,
             ensembl_gene_id = df$ensembl_gene_id,
             transcript_id = subset(df, select = 5)
18fd2879
         )
045b7659
         names(rowRanges) <- as.character(df$gene_id)
         assay.list <- assay.list[which(assay.list != "transcript_id")]
     } else {
         rowRanges <- GRanges(
             seqnames = paste0("chr", df$chromosome_name),
             ranges = IRanges(
                 start = df$start_position,
                 end = df$end_position
             ),
             strand = df$strand,
             gene_id = df$external_gene_name,
             entrezgene = df$entrezgene_id,
             ensembl_gene_id = df$ensembl_gene_id
         )
         names(rowRanges) <- as.character(df$external_gene_name)
     }
 
     suppressWarnings({
         assays <- lapply(assay.list, function(x) {
             return(
                 data.matrix(
                     subset(df, select = grep(x,colnames(df),ignore.case = TRUE))
                 )
             )
         })
     })
 
     names(assays) <- assay.list
     regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}",
                     "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}")
     samples <- na.omit(unique(str_match(colnames(df),regex)[,1]))
     colData <-  colDataPrepare(samples)
 
     assays <- lapply(assays, function(x){
         colnames(x) <- NULL
         rownames(x) <- NULL
         return(x)
02d1bac3
     })
045b7659
 
     message("Available assays in SummarizedExperiment : \n  => ",paste(names(assays), collapse = "\n  => "))
     rse <- SummarizedExperiment(
         assays = assays,
         rowRanges = rowRanges,
         colData = colData
     )
     return(rse)
14236a24
 }
178c3b9a
 
ad5fd34d
 
 #' @importFrom downloader download
 #' @importFrom S4Vectors DataFrame
8b67c89a
 makeSEFromDNAMethylationMatrix <- function(
045b7659
         betas,
         genome = "hg38",
         met.platform = c(
             "Illumina Human Methylation 450",
             "Illumina Human Methylation 27",
             "Illumina Methylation Epic"
         )
38297df4
 ) {
045b7659
     message("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-")
     message("Creating a SummarizedExperiment from DNA methylation input")
 
     # Instead of looking on the size, it is better to set it as a argument as the annotation is different
     annotation <-   getMetPlatInfo(platform = met.platform, genome = genome)
 
     rowRanges <- annotation[names(annotation) %in% rownames(betas),,drop = FALSE]
 
5b7bcdb4
     colData <- tryCatch({
         colDataPrepare(colnames(betas))
     }, error = function(e){
         DataFrame(samples = colnames(betas))
     })
045b7659
     betas <- betas[rownames(betas) %in% names(rowRanges),,drop = FALSE]
     betas <- betas[names(rowRanges),,drop = FALSE]
     assay <- data.matrix(betas)
 
     betas <- SummarizedExperiment(
         assays = assay,
         rowRanges = rowRanges,
         colData = colData
     )
     return(betas)
ad5fd34d
 }
 
7ab7fc99
 makeSEfromDNAmethylation <- function(df, probeInfo = NULL){
045b7659
     if(is.null(probeInfo)) {
         rowRanges <- GRanges(
             seqnames = paste0("chr", df$Chromosome),
             ranges = IRanges(start = df$Genomic_Coordinate,
                              end = df$Genomic_Coordinate),
             probeID = df$Composite.Element.REF,
             Gene_Symbol = df$Gene_Symbol
         )
 
         names(rowRanges) <- as.character(df$Composite.Element.REF)
         colData <-  colDataPrepare(colnames(df)[5:ncol(df)])
         assay <- data.matrix(subset(df,select = c(5:ncol(df))))
     } else {
         rowRanges <- makeGRangesFromDataFrame(probeInfo, keep.extra.columns = TRUE)
         colData <-  colDataPrepare(colnames(df)[(ncol(probeInfo) + 1):ncol(df)])
         assay <- data.matrix(subset(df,select = c((ncol(probeInfo) + 1):ncol(df))))
     }
     colnames(assay) <- rownames(colData)
     rownames(assay) <- as.character(df$Composite.Element.REF)
     rse <- SummarizedExperiment(assays = assay, rowRanges = rowRanges, colData = colData)
178c3b9a
 }
 
18fd2879
 readIDATDNAmethylation <- function(
045b7659
         files,
         barcode,
         summarizedExperiment,
c8404f5f
         platform
18fd2879
 ) {
045b7659
 
     check_package("sesame")
 
     # Check if moved files would be moved outside of scope folder, if so, path doesn't change
c8404f5f
     moved.files <- sapply(files,USE.NAMES = FALSE,function(x){
045b7659
         if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) {
             return(file.path(dirname(dirname(x)), basename(x)))
         }
         return(x)
     })
 
     # for each file move it to upper parent folder if necessary
     plyr::a_ply(files, 1,function(x){
         if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) {
             tryCatch(
                 move(x,
                      file.path(dirname(dirname(x)), basename(x)),
                      keep.copy = FALSE
                 ),
                 error = function(e){
                 })
         }
     })
 
     samples <- unique(gsub("_Grn.idat|_Red.idat","",moved.files))
     message("Processing  IDATs with Sesame - http://bioconductor.org/packages/sesame/")
     message("Running opensesame - applying quality masking and nondetection masking (threshold P-value 0.05)")
     message("Please cite: doi: 10.1093/nar/gky691 and 10.1093/nar/gkt090")
     message("This might take a while....")
     betas <- sesame::openSesame(samples)  %>% as.matrix
     barcode <- unique(data.frame("file" = gsub("_Grn.idat|_Red.idat","",basename(moved.files)), "barcode" = barcode))
     colnames(betas) <- barcode$barcode[match(basename(samples),barcode$file)]
 
     if (summarizedExperiment) {
 
         betas <- makeSEFromDNAMethylationMatrix(
             betas = betas,
c8404f5f
             genome ="hg38",
045b7659
             met.platform = platform
         )
         colData(betas) <- DataFrame(colDataPrepare(colnames(betas)))
38297df4
     }
045b7659
     return(betas)
 
ad5fd34d
 }
 
178c3b9a
 # We will try to make this function easier to use this function in its own data
 # In case it is not TCGA I should not consider that there is a barcode in the header
 # Instead the user should be able to add the names to his data
 # The only problem is that the data from the user will not have all the columns
 # TODO: Improve this function to be more generic as possible
0d7a13b9
 #' @importFrom GenomicRanges makeGRangesFromDataFrame
 #' @importFrom tibble as_data_frame
7fe66781
 #' @importFrom data.table fread
acdc3fa8
 readDNAmethylation <- function(
045b7659
         files,
         cases,
         summarizedExperiment = TRUE,
c8404f5f
         platform
acdc3fa8
 ){
045b7659
     if(length(platform) > 1){
 
         print(knitr::kable(platform))
         stop("More than one DNA methylation platform found. Only one is accepted")
38297df4
     }
045b7659
     if (missing(cases)) cases <- NULL
 
 
     if (grepl("OMA00",platform)) {
         pb <- txtProgressBar(min = 0, max = length(files), style = 3)
         for (i in seq_along(files)) {
             data <- fread(
                 files[i],
                 header = TRUE,
                 sep = "\t",
                 stringsAsFactors = FALSE,
                 skip = 1,
                 na.strings = "N/A",
                 colClasses = c(
                     "character", # Composite Element REF
                     "numeric" # # beta value
                 )
             )
             setnames(data,gsub(" ", "\\.", colnames(data)))
             if (!is.null(cases)) setnames(data,2,cases[i])
             if (i == 1) {
                 df <- data
             } else {
                 df <- merge(df, data, by = "Composite.Element.REF")
             }
             setTxtProgressBar(pb, i)
         }
         setDF(df)
         rownames(df) <- df$Composite.Element.REF
         df$Composite.Element.REF <- NULL
     } else if (all(grepl("methylation_array.sesame.level3betas", files))){
         # methylation_array.sesame.level3betas has only two columns with not header
         print.header(paste0("Reading ", length(files)," files"),"subsection")
 
         x <- plyr::alply(files,1, function(f) {
             data <- fread(
                 f,
                 header = FALSE,
                 sep = "\t",
                 stringsAsFactors = FALSE,
                 skip = 0,
                 colClasses = c(
                     "character", # CpG
                     "numeric" # beta value
                 )
             )
             setnames(data,gsub(" ", "\\.", colnames(data)))
             if (!is.null(cases)) setnames(data,2,cases[which(f == files)])
         }, .progress = "time")
 
         print.header(paste0("Merging ", length(files)," files"),"subsection")
 
         # Just check if the data is in the same order, since we will not merge
         # the data frames to save memory
93fd0bdc
         # C3N-01362-01 has less probes than C3L-00913-03 due to version of the array
         # CPTAC cohort: 55 samples with EPIC v1 and 1808  with EPIC V2
         # EPIC v1 has less probes than EPIC V2
 
         # In case we have multiple versions of the array we will
         # 1. Check the names of all of the same nrow
         # 2. Bind the ones with same nrow
         # 3. Full join the two objects
         nrow_vec <- purrr::map_int(x,nrow) # number of rows each file
         nrow_vec_numbers <- nrow_vec %>% unique # unique number of rows
 
         for(nb_rows in nrow_vec_numbers){
             aux <- x[which(nrow_vec == nb_rows)]
             stopifnot(all(unlist(aux %>% map(function(y){all(y[,1]$V1 ==  aux[[1]][,1]$V1)}) )))
         }
045b7659
 
93fd0bdc
         df <- map(nrow_vec_numbers,.f = function(nb_rows){
             file_equal_probes <- x[which(nrow_vec == nb_rows)]
             df <- file_equal_probes %>% map_df(2)
             colnames(df) <- file_equal_probes %>% map_chr(.f = function(y) colnames(y)[2])
             df$V1 <- file_equal_probes[[1]]$V1
84cf1d0b
             if (any(duplicated(colnames(df)))){
                 message("oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo")
                 message("Duplicated samples names were found. Adding _rep suffix to name")
                 message("oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo")
             }
             colnames(df)[duplicated(colnames(df))] <- paste0(colnames(df)[duplicated(colnames(df))],"_rep")
93fd0bdc
             df
         }) %>% purrr::reduce(dplyr::full_join,by = "V1") %>% as.data.frame()
         rownames(df) <- df$V1
         df$V1 <- NULL
045b7659
         df <- data.matrix(df)
 
         if (summarizedExperiment) {
 
             df <- makeSEFromDNAMethylationMatrix(
                 betas = df,
c8404f5f
                 genome = "hg38",
045b7659
                 met.platform = platform
             )
         }
 
38297df4
     } else {
045b7659
         skip <- ifelse(all(grepl("hg38",files)), 0,1)
         colClasses <- NULL
         if (!all(grepl("hg38",files))) {
             colClasses <- c(
                 "character", # Composite Element REF
                 "numeric",   # beta value
                 "character", # Gene symbol
                 "character", # Chromosome
                 "integer"
             )
         }
 
 
         x <- plyr::alply(files,1, function(f) {
             data <- fread(
                 f,
                 header = TRUE,
                 sep = "\t",
                 stringsAsFactors = FALSE,
                 skip = skip,
                 colClasses = colClasses
             )
             setnames(data,gsub(" ", "\\.", colnames(data)))
             if (!is.null(cases)) setnames(data,2,cases[which(f == files)])
             setcolorder(data,c(1, 3:ncol(data), 2))
         }, .progress = "time")
 
 
         print.header(paste0("Merging ", length(files)," files"),"subsection")
 
         # Just check if the data is in the same order, since we will not merge
         # the data frames to save memory
         stopifnot(all(unlist(x %>% map(function(y){all(y[,1] ==  x[[1]][,1])}) )))
 
         # if hg38 we have 10 columns with probe metadata
         # if hg19 we have 4 columns with probe metadata
         idx.dnam <- grep("TCGA",colnames(x[[1]]))
         df <- x %>%  map_df(idx.dnam)
         colnames(df) <- x %>%  map_chr(.f = function(y) colnames(y)[idx.dnam])
         df <- bind_cols(x[[1]][,1:(idx.dnam-1)],df)
 
         if (summarizedExperiment) {
             if(skip == 0) {
                 df <- makeSEfromDNAmethylation(
                     df,
                     probeInfo = data.frame(df)[,grep("TCGA",colnames(df),invert = TRUE)]
                 )
             } else {
                 df <- makeSEfromDNAmethylation(df)
             }
         } else {
             setDF(df)
             rownames(df) <- df$Composite.Element.REF
             df$Composite.Element.REF <- NULL
         }
38297df4
     }
045b7659
     return(df)
178c3b9a
 }
 
d568a61d
 # Barcode example MMRF_1358_1_BM_CD138pos_T1_TSMRU_L02337
 colDataPrepareMMRF <- function(barcode){
045b7659
     DataFrame(
         barcode = barcode,
         sample = barcode,
         patient = substr(barcode,1,9)
     )
d568a61d
 }
 
c6c5016d
 colDataPrepareTARGET <- function(barcode){
dc73efdc
 
 
045b7659
     message("Adding description to TARGET samples")
     tissue.code <- c(
         '01',
         '02',
         '03',
         '04',
         '05',
         '06',
         '07',
         '08',
         '09',
         '10',
         '11',
         '12',
         '13',
         '14',
         '15',
         '16',
         '17',
         '20',
         '40',
         '41',
         '42',
         '50',
         '60',
         '61',
dc73efdc
         '85',
045b7659
         '99'
     )
 
     definition <- c(
         "Primary solid Tumor", # 01
         "Recurrent Solid Tumor", # 02
         "Primary Blood Derived Cancer - Peripheral Blood", # 03
         "Recurrent Blood Derived Cancer - Bone Marrow", # 04
         "Additional - New Primary", # 05
         "Metastatic", # 06
         "Additional Metastatic", # 07
         "Tissue disease-specific post-adjuvant therapy", # 08
         "Primary Blood Derived Cancer - Bone Marrow", # 09
         "Blood Derived Normal", # 10
         "Solid Tissue Normal",  # 11
         "Buccal Cell Normal",   # 12
         "EBV Immortalized Normal", # 13
         "Bone Marrow Normal", # 14
         "Fibroblasts from Bone Marrow Normal", # 15
         "Mononuclear Cells from Bone Marrow Normal", # 16
         "Lymphatic Tissue Normal (including centroblasts)", # 17
         "Control Analyte", # 20
         "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
         "Blood Derived Cancer- Bone Marrow, Post-treatment", # 41
         "Blood Derived Cancer- Peripheral Blood, Post-treatment", # 42
         "Cell line from patient tumor", # 50
         "Xenograft from patient not grown as intermediate on plastic tissue culture dish", # 60
         "Xenograft grown in mice from established cell lines", #61
dc73efdc
         "Next Generation Cancer Model", # 85
045b7659
         "Granulocytes after a Ficoll separation"
     ) # 99
dc73efdc
     aux <- data.frame(tissue.code = tissue.code,definition)
 
     # Exceptions: TARGET-AML
     # "TARGET-20-PAYKXV-Sorted-CD34neg-Lymphoid-09A" 727f5e27-09e9-425f-9757-81da66c054bd
     # "TARGET-20-PAWUEX-EOI2-14A"  11fb7cce-06ad-445b-b09f-9dc1050e4cd4
     # "TARGET-20-PAUSTZ-EOI1-14A" 3ad6dfa7-734d-41b0-ad66-15f0cff630d0
     # "TARGET-20-PAYHMK-Sorted-leukemic-09A" f5bd72ae-919f-481c-8203-f878ee1c651a
     # "TARGET-20-D7-Myeloid-mock3-85" 88712125-67e5-4e47-a413-e55b522a641c
     # Normal case: TARGET-20-PANLXK-09A-03R
045b7659
     # in case multiple equal barcode
dc73efdc
     #regex <- paste0("[:alnum:]{6}-[:alnum:]{2}-[:alnum:]{6}",
     #                "-[:alnum:]{3}-[:alnum:]{3}")
     #samples <- str_match(barcode,regex)[,1]
045b7659
 
dc73efdc
     # this breaks for the exceptions
     if(!any(grepl(";",barcode))){
         ret <- data.frame(
             barcode = barcode,
             tumor.code = stringr::str_extract(pattern = "TARGET-[[:alnum:]]{2}",barcode) %>% gsub(pattern = "TARGET-","",.),
             sample = gsub(pattern = "-[[:alnum:]]{3}$","",barcode),
             patient = gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",barcode),
             case.unique.id = gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",barcode) %>%  gsub(pattern = "TARGET-[[:alnum:]]{2}-","",.),
             tissue.code = barcode %>% stringr::str_extract("[[:alnum:]]{2,3}-[[:alnum:]]{3}$") %>% stringr::str_extract("[[:alnum:]]{2,3}") %>% stringr::str_sub(1,2),
             nucleic.acid.code = barcode %>% stringr::str_extract("[[:alnum:]]{3}$")  %>% stringr::str_sub(3,3)
         )
     } else {
         # mixed samples case
         ret <- purrr::map_dfr(.x = barcode,.f = function(b){
             data.frame(
                 barcode = b,
                 tumor.code = stringr::str_extract(pattern = "TARGET-[[:alnum:]]{2}",b) %>% gsub(pattern = "TARGET-","",.),
                 sample = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y) gsub(pattern = "-[[:alnum:]]{3}$","",y)) %>% paste0(collapse = ";"),
                 patient = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y)  gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",y)) %>% paste0(collapse = ";"),
                 case.unique.id = b %>% stringr::str_split(";") %>% unlist %>% sapply(FUN = function(y)  gsub(pattern = "-[[:alnum:]]{2,3}-[[:alnum:]]{3}$","",y)  %>%  gsub(pattern = "TARGET-[[:alnum:]]{2}-","",.) %>% paste0(collapse = ";")),
                 tissue.code = b %>% stringr::str_extract("[[:alnum:]]{2,3}-[[:alnum:]]{3}$") %>% stringr::str_extract("[[:alnum:]]{2,3}") %>% stringr::str_sub(1,2),
                 nucleic.acid.code = b %>% stringr::str_extract("[[:alnum:]]{3}$")  %>% stringr::str_sub(3,3)
             )
         })
     }
     ret <-  ret %>% dplyr::left_join(aux, by = "tissue.code")
045b7659
 
     tumor.code <- c('00','01','02','03','04','10','15','20','21','30','40',
                     '41','50','51','52','60','61','62','63','64','65','70','71','80','81')
 
     tumor.definition <- c(
         "Non-cancerous tissue", # 00
         "Diffuse Large B-Cell Lymphoma (DLBCL)", # 01
         "Lung Cancer (all types)", # 02
         "Cervical Cancer (all types)", # 03
         "Anal Cancer (all types)", # 04
         "Acute lymphoblastic leukemia (ALL)", # 10
         "Mixed phenotype acute leukemia (MPAL)", # 15
         "Acute myeloid leukemia (AML)", # 20
         "Induction Failure AML (AML-IF)", # 21
         "Neuroblastoma (NBL)", # 30
         "Osteosarcoma (OS)",  # 40
         "Ewing sarcoma",   # 41
         "Wilms tumor (WT)", # 50
         "Clear cell sarcoma of the kidney (CCSK)", # 51
         "Rhabdoid tumor (kidney) (RT)", # 52
         "CNS, ependymoma", # 60
         "CNS, glioblastoma (GBM)", # 61
         "CNS, rhabdoid tumor", # 62
         "CNS, low grade glioma (LGG)", # 63
         "CNS, medulloblastoma", # 64
         "CNS, other", # 65
         "NHL, anaplastic large cell lymphoma", # 70
         "NHL, Burkitt lymphoma (BL)", # 71
         "Rhabdomyosarcoma", #80
         "Soft tissue sarcoma, non-rhabdomyosarcoma"
     ) # 81
dc73efdc
     aux <- data.frame(tumor.code = tumor.code,tumor.definition)
     ret <- ret %>% dplyr::left_join(aux, by = "tumor.code")
045b7659
 
     nucleic.acid.code <- c('D','E','W','X','Y','R','S')
     nucleic.acid.description <-  c(
         "DNA, unamplified, from the first isolation of a tissue",
         "DNA, unamplified, from the first isolation of a tissue embedded in FFPE",
         "DNA, whole genome amplified by Qiagen (one independent reaction)",
         "DNA, whole genome amplified by Qiagen (a second, separate independent reaction)",
         "DNA, whole genome amplified by Qiagen (pool of 'W' and 'X' aliquots)",
         "RNA, from the first isolation of a tissue",
         "RNA, from the first isolation of a tissue embedded in FFPE"
     )
dc73efdc
     aux <- data.frame(nucleic.acid.code = nucleic.acid.code,nucleic.acid.description)
     ret <- ret %>% dplyr::left_join(aux, by = "nucleic.acid.code")
045b7659
 
     ret <- ret[match(barcode,ret$barcode),]
     rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE))
     ret$code <- NULL
     return(DataFrame(ret))
c6c5016d
 }
 
 colDataPrepareTCGA <- function(barcode){
045b7659
     # For the moment this will work only for TCGA Data
     # We should search what TARGET data means
 
c8404f5f
     code <- c(
         '01','02','03','04','05','06','07','08','09','10','11',
         '12','13','14','20','40','50','60','61'
     )
     shortLetterCode <- c(
         "TP","TR","TB","TRBM","TAP","TM","TAM","THOC",
         "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB",
         "CELL","XP","XCL"
     )
 
     definition <- c(
         "Primary solid Tumor", # 01
         "Recurrent Solid Tumor", # 02
         "Primary Blood Derived Cancer - Peripheral Blood", # 03
         "Recurrent Blood Derived Cancer - Bone Marrow", # 04
         "Additional - New Primary", # 05
         "Metastatic", # 06
         "Additional Metastatic", # 07
         "Human Tumor Original Cells", # 08
         "Primary Blood Derived Cancer - Bone Marrow", # 09
         "Blood Derived Normal", # 10
         "Solid Tissue Normal",  # 11
         "Buccal Cell Normal",   # 12
         "EBV Immortalized Normal", # 13
         "Bone Marrow Normal", # 14
         "Control Analyte", # 20
         "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
         "Cell Lines", # 50
         "Primary Xenograft Tissue", # 60
         "Cell Line Derived Xenograft Tissue"
     ) # 61
045b7659
     aux <- DataFrame(code = code,shortLetterCode,definition)
 
     # in case multiple equal barcode
     regex <- paste0("[:alnum:]{4}-[:alnum:]{2}-[:alnum:]{4}",
                     "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}")
     samples <- str_match(barcode,regex)[,1]
 
c8404f5f
     ret <- DataFrame(
         barcode = barcode,
         patient = substr(barcode, 1, 12),
         sample = substr(barcode, 1, 16),
         code = substr(barcode, 14, 15)
     )
045b7659
     ret <- merge(ret,aux, by = "code", sort = FALSE)
     ret <- ret[match(barcode,ret$barcode),]
     rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE))
     ret$code <- NULL
     return(DataFrame(ret))
178c3b9a
 }
 
cc4e8cfd
 #' @title Create samples information matrix for GDC samples
21321b60
 #' @description Create samples information matrix for GDC samples add subtype information
a90c7d63
 #' @param barcode TCGA or TARGET barcode
f57861ed
 #' @importFrom plyr rbind.fill
a4b2a12d
 #' @importFrom dplyr left_join
e385d93e
 #' @examples
c1e5918e
 #'  metadata <- colDataPrepare(c("TCGA-OR-A5K3-01A","C3N-00321-01"))
 #'  metadata <- colDataPrepare(c("BLGSP-71-06-00157-01A",
 #'                               "BLGSP-71-22-00332-01A"))
a3f2bc5f
 #' @export
c6c5016d
 colDataPrepare <- function(barcode){
045b7659
     # For the moment this will work only for TCGA Data
     # We should search what TARGET data means
     message("Starting to add information to samples")
     ret <- NULL
 
     if(all(grepl("TARGET",barcode))) ret <- colDataPrepareTARGET(barcode)
     if(all(grepl("TCGA",barcode))) ret <- colDataPrepareTCGA(barcode)
     if(all(grepl("MMRF",barcode))) ret <- colDataPrepareMMRF(barcode)
 
     # How to deal with mixed samples "C3N-02003-01;C3N-02003-021" ?
6281eff8
     # Check if this breaks the package
045b7659
     if(any(grepl("C3N-|C3L-",barcode))) {
         ret <- data.frame(
b2bc706f
             sample = map(barcode,.f = function(x) stringr::str_split(x,";") %>% unlist) %>% unlist()
045b7659
         )
     }
b2bc706f
 
045b7659
     if(is.null(ret)) {
         ret <- data.frame(
             sample = barcode %>% unique,
             stringsAsFactors = FALSE
         )
     }
 
     message(" => Add clinical information to samples")
     # There is a limitation on the size of the string, so this step will be splited in cases of 100
     patient.info <- NULL
 
     patient.info <- splitAPICall(
         FUN = getBarcodeInfo,
         step = 10,
         items = ret$sample
38297df4
     )
dc73efdc
 
045b7659
     if(!is.null(patient.info)) {
         ret$sample_submitter_id <- ret$sample %>% as.character()
         ret <- left_join(ret %>% as.data.frame, patient.info %>% unique, by = "sample_submitter_id")
     }
     ret$bcr_patient_barcode <- ret$sample %>% as.character()
7697beda
     ret$sample_submitter_id <- ret$sample %>% as.character()
045b7659
     if(!"project_id" %in% colnames(ret)) {
         if("disease_type" %in% colnames(ret)){
             aux <- getGDCprojects()[,c(5,7)]
             aux <- aux[aux$disease_type == unique(ret$disease_type),2]
             ret$project_id <- as.character(aux)
         }
7697beda
     }
045b7659
     # There is no subtype info for target, return as it is
     if(any(grepl("TCGA",barcode))) {
         ret <- addSubtypeInfo(ret)
     }
 
     # na.omit should not be here, exceptional case
b2bc706f
     if(is.null(ret)) {
         return(
             data.frame(
                 row.names = barcode,
                 barcode,
                 stringsAsFactors = FALSE
             )
         )
     }
045b7659
 
     # Add purity information from http://www.nature.com/articles/ncomms9971
     # purity  <- getPurityinfo()
     # ret <- merge(ret, purity, by = "sample", all.x = TRUE, sort = FALSE)
 
     # Put data in the right order
     ret <- ret[!duplicated(ret$bcr_patient_barcode),]
 
     # This part might not work with multiple projects
     idx <- sapply(
         X = substr(barcode,1,min(stringr::str_length(ret$bcr_patient_barcode))),
         FUN =  function(x) {
             grep(x,ret$bcr_patient_barcode)
         }
     )
     # the code above does not work, since the projects have different string lengths
ad025eb9
     if(all(na.omit(ret$project_id) %in% c("TARGET-ALL-P3","TARGET-AML"))) {
045b7659
         idx <- sapply(gsub("-[[:alnum:]]{3}$","",barcode), function(x) {
             grep(x,ret$bcr_patient_barcode)
         })
     }
 
     if(any(ret$project_id == "CPTAC-3",na.rm = T)) {
0cb9cae4
 
f18368c6
         # only merge mixed samples
b2bc706f
         mixed_samples <- grep(";",barcode,value = T)
         if(length(mixed_samples) > 0){
             mixed_samples <- mixed_samples %>% str_split(";") %>% unlist %>% unique
f18368c6
 
b2bc706f
             ret_mixed_samples <- ret %>% dplyr::filter(sample_submitter_id %in% mixed_samples) %>%
                 dplyr::group_by(submitter_id) %>%
f18368c6
                 dplyr::summarise_all(~trimws(paste(unique(.), collapse = ';'))) %>%
                 as.data.frame()
b2bc706f
             ret <- rbind(ret_mixed_samples,ret)
f18368c6
         }
0cb9cae4
         idx <- match(barcode,ret$bcr_patient_barcode)
 
         #idx <- sapply(gsub("-[[:alnum:]]{3}$","",barcode), function(x) {
         #    if(grepl(";",x = x)) x <- stringr::str_split(x[1],";")[[1]][1] # mixed samples
         #    grep(x,ret$bcr_patient_barcode)
         #})
 
045b7659
     }
 
dc73efdc
     if(any(ret$project_id %in% c("CMI-MBC","TARGET-NBL"),na.rm = T)) {
045b7659
         idx <- match(barcode,ret$bcr_patient_barcode)
     }
 
 
     if(is.list(idx)){
         stop(
             "Prepare will not be possible.
             \nIf you are trying to prepare more than
              one different project at a time, please do it separately"
         )
     }
 
 
     ret <- ret[idx,]
 
     if("barcode" %in% colnames(ret)) ret$barcode <- barcode
 
     rownames(ret) <- barcode
     return(ret)
c6c5016d
 }
 
e4b4e205
 addSubtypeInfo <- function(ret){
045b7659
     out <- NULL
     message(" => Adding TCGA molecular information from marker papers")
     message(" => Information will have prefix 'paper_' ")
 
     for(proj in na.omit(unique(ret$project_id))){
         if(grepl("TCGA",proj,ignore.case = TRUE)) {
             # remove letter from 01A 01B etc
             ret$sample.aux <- substr(ret$sample,1,15)
 
             tumor <- gsub("TCGA-","",proj)
             available <- c(
                 "ACC",
                 "BRCA",
                 "BLCA",
                 "CESC",
                 "CHOL",
                 "COAD",
                 "ESCA",
                 "GBM",
                 "HNSC",
                 "KICH",
                 "KIRC",
                 "KIRP",
                 "LGG",
                 "LUAD",
                 "LUSC",
                 "PAAD",
                 "PCPG",
                 "PRAD",
                 "READ",
                 "SKCM",
                 "SARC",
                 "STAD",
                 "THCA",
                 "UCEC",
                 "UCS",
                 "UVM"
             )
             if (grepl(paste(c(available,"all"),collapse = "|"),tumor,ignore.case = TRUE)) {
                 subtype <- TCGAquery_subtype(tumor)
                 colnames(subtype) <- paste0("paper_", colnames(subtype))
 
                 if(all(str_length(subtype$paper_patient) == 12)){
                     # Subtype information were to primary tumor in priority
                     subtype$sample.aux <- paste0(subtype$paper_patient,"-01")
                 }
                 ret.aux <- ret[ret$sample.aux %in% subtype$sample.aux,]
                 ret.aux <- merge(ret.aux,subtype, by = "sample.aux", all.x = TRUE)
                 out <- rbind.fill(as.data.frame(out),as.data.frame(ret.aux))
             }
e4b4e205
         }
     }
045b7659
     if(is.null(out)) return(ret)
 
     # We need to put together the samples with subtypes with samples without subytpes
     ret.aux <- ret[!ret$sample %in% out$sample,]
     ret <- rbind.fill(as.data.frame(out),as.data.frame(ret.aux))
     ret$sample.aux <- NULL
 
     return(ret)
e4b4e205
 }
178c3b9a
 
 readProteinExpression <- function(files,cases) {
045b7659
     pb <- txtProgressBar(min = 0, max = length(files), style = 3)
     for (i in seq_along(files)) {
         data <- read_tsv(file = files[i], col_names = TRUE,skip = 1, col_types = c("cn"))
         if(!missing(cases))  colnames(data)[2] <- cases[i]
         if(i == 1) df <- data
         if(i != 1) df <- merge(df, data,all=TRUE, by="Composite Element REF")
         setTxtProgressBar(pb, i)
     }
     close(pb)
 
     return(df)
178c3b9a
 }
 
785ee06b
 readProteomeProfiling <- function(files,cases) {
045b7659
 
     list.of.protein.df <- plyr::alply(files,1, function(f) {
         data <- readr::read_tsv(
             file = f,
             show_col_types = FALSE,
             col_names = TRUE,
             progress = FALSE
         )
         colnames(data)[ncol(data)] <- cases[which(f == files)]
         data
     }, .progress = "time")
 
     # Just check if the data is in the same order, since we will not merge
     # the data frames to save memory
 
     df <- tryCatch({
         stopifnot(all(unlist(list.of.protein.df %>% map(function(y){all(y[,5] ==  list.of.protein.df[[1]][,5])}) )))
 
         # need to check if it works in all cases
         # tested for HTSeq - FPKM-UQ and Counts only
         bind_cols(list.of.protein.df[[1]][,1:5],list.of.protein.df %>%  map_df(6))
     },error = function(e){
         message("Some files have a  different number of proteins, we will introduce NA for the missing values")
         plyr::join_all(dfs = list.of.protein.df ,type = "full",by = c("AGID","lab_id","catalog_number","set_id","peptide_target"))
     })
 
     if(!missing(cases))  colnames(df)[-c(1:5)] <- cases
 
     return(df)
785ee06b
 }
 
 makeSEfromProteomeProfiling <- function(){
045b7659
 
785ee06b
 }
079aab6c
 
7697beda
 makeSEfromTranscriptomeProfilingSTAR <- function(
045b7659
         data,
         cases
7697beda
 ){
045b7659
     # How many cases do we have?
     # We wil consider col 1 is the ensemble gene id, other ones are data
     size <- ncol(data)
     # Prepare Patient table
     colData <-  colDataPrepare(cases)
 
     # one ensemblID can be mapped to multiple entrezgene ID
     gene.location <- get.GRCh.bioMart("hg38",as.granges = TRUE)
 
     metrics <- subset(data, !grepl("ENSG", data$gene_id))
     data <- subset(data, grepl("ENSG", data$gene_id))
     found.genes <- table(data$gene_id %in% gene.location$gene_id)
 
     if ("FALSE" %in% names(found.genes)){
         message(paste0("From the ", nrow(data), " genes we couldn't map ", found.genes[["FALSE"]]))
     }
 
     # Prepare data table
     # Remove the version from the ensembl gene id
     assays <- list(
         data.matrix(data[,grep("^unstranded",colnames(data)),with = FALSE]),
         data.matrix(data[,grep("stranded_first",colnames(data)),with = FALSE]),
         data.matrix(data[,grep("stranded_second",colnames(data)),with = FALSE]),
         data.matrix(data[,grep("tpm_unstrand",colnames(data)),with = FALSE]),
         data.matrix(data[,grep("fpkm_unstrand",colnames(data)),with = FALSE]),
         data.matrix(data[,grep("fpkm_uq_unstrand",colnames(data)),with = FALSE])
     )
     names(assays) <- c(
         "unstranded",
         "stranded_first",
         "stranded_second",
         "tpm_unstrand",
         "fpkm_unstrand",
         "fpkm_uq_unstrand"
     )
     assays <- lapply(assays, function(x){
         colnames(x) <- NULL
         rownames(x) <- NULL
         return(x)
     })
 
     # Prepare rowRanges
     rowRanges <- gene.location[match(data$gene_id, gene.location$gene_id),]
     names(rowRanges) <- as.character(data$gene_id)
 
 
     message("Available assays in SummarizedExperiment : \n  => ",paste(names(assays), collapse = "\n  => "))
     rse <- SummarizedExperiment(
         assays = assays,
         rowRanges = rowRanges,
         colData = colData
     )
     metadata(rse) <- metrics
     return(rse)
079aab6c
 }
 
 
c6c5016d
 makeSEfromTranscriptomeProfiling <- function(data, cases, assay.list){
045b7659
     # How many cases do we have?
     # We wil consider col 1 is the ensemble gene id, other ones are data
     size <- ncol(data)
     # Prepare Patient table
     colData <-  colDataPrepare(cases)
 
     # one ensemblID can be mapped to multiple entrezgene ID
     gene.location <- get.GRCh.bioMart("hg38")
     gene.location <- gene.location[!duplicated(gene.location$ensembl_gene_id),]
 
     data$ensembl_gene_id <- as.character(gsub("\\.[0-9]*","",data$X1))
     data <- subset(data, grepl("ENSG", data$ensembl_gene_id))
     found.genes <- table(data$ensembl_gene_id %in% gene.location$ensembl_gene_id)
 
     if("FALSE" %in% names(found.genes))
         message(paste0("From the ", nrow(data), " genes we couldn't map ", found.genes[["FALSE"]]))
 
     data <- merge(data, gene.location, by = "ensembl_gene_id")
 
     # Prepare data table
     # Remove the version from the ensembl gene id
     assays <- list(data.matrix(data[,2:size+1]))
     names(assays) <- assay.list
     assays <- lapply(assays, function(x){
         colnames(x) <- NULL
         rownames(x) <- NULL
         return(x)
     })
 
     # Prepare rowRanges
     rowRanges <- GRanges(
         seqnames = paste0("chr", data$chromosome_name),
         ranges = IRanges(
             start = data$start_position,
             end = data$end_position
         ),
         strand = data$strand,
         ensembl_gene_id = data$ensembl_gene_id,
         external_gene_name = data$external_gene_name,
         original_ensembl_gene_id = data$X1
     )
     names(rowRanges) <- as.character(data$ensembl_gene_id)
     message("Available assays in SummarizedExperiment : \n  => ",paste(names(assays), collapse = "\n  => "))
     rse <- SummarizedExperiment(
         assays = assays,
         rowRanges = rowRanges,
         colData = colData
     )
 
     return(rse)
c6c5016d
 }
 
6bf60949
 
 #' @importFrom dplyr left_join
d7615b50
 #' @importFrom plyr alply join_all
dbee506d
 #' @importFrom purrr map_dfc map map_df
d7615b50
 readTranscriptomeProfiling <- function(
045b7659
         files,
         data.type,
         workflow.type,
         cases,
         summarizedExperiment
d7615b50
 ) {
045b7659
 
     if(grepl("Gene Expression Quantification", data.type, ignore.case = TRUE)){
         # Status working for:
         #  - htseq
         #  - FPKM
         #  - FPKM-UQ
         if(grepl("HTSeq",workflow.type)){
 
             x <- plyr::alply(files,1, function(f) {
                 readr::read_tsv(
                     file = f,
                     col_names = FALSE,
                     progress = FALSE,
                     col_types = c("cd")
                 )
             }, .progress = "time")
 
             # Just check if the data is in the same order, since we will not merge
             # the data frames to save memory
             stopifnot(all(unlist(x %>% map(function(y){all(y[,1] ==  x[[1]][,1])}) )))
 
             # need to check if it works in all cases
             # tested for HTSeq - FPKM-UQ and Counts only
             df <- bind_cols(x[[1]][,1],x %>%  map_df(2))
             if(!missing(cases))  colnames(df)[-1] <- cases
             if(summarizedExperiment) df <- makeSEfromTranscriptomeProfiling(df,cases,workflow.type)
         } else  if(grepl("STAR",workflow.type)){
 
             # read files that has 4 not necessary rows, and has several columns
             # gene_id gene_name gene_type
             # unstranded stranded_first stranded_second tpm_unstranded fpkm_unstranded
             x <- plyr::alply(files,1, function(f) {
                 data.table::fread(f)
             }, .progress = "time")
 
             df <- data.table::rbindlist(
                 x, use.names = TRUE, idcol = "case_barcode"
             )
3ff7686f
 
             # Exception to the code below: CPTAC-3
             # Sample barcode in CPTAC-3 does not handle duplicates
             # we will need to work with aliquots for it or remove duplicated
             # files. Example: C3N-02765-02
 
045b7659
             if(!missing(cases))  {
                 df$case_barcode <- factor(
                     cases[df$case_barcode %>% as.numeric()],
3ff7686f
                     levels = unique(cases)
045b7659
                 )
             }
 
             # this part changes the cases order if not a factor
             df <- data.table::dcast(
                 data = df,
                 formula = gene_id + gene_name + gene_type ~ case_barcode,
                 value.var = colnames(df)[-c(1:4)]
             )
 
             # Adding barcodes to columns names, if user wants a dataframe
 
             if(summarizedExperiment) {
                 df <- makeSEfromTranscriptomeProfilingSTAR(
                     data = df,
                     cases = cases
                 )
             }
         }
     } else if(grepl("miRNA", workflow.type, ignore.case = TRUE) & grepl("miRNA", data.type, ignore.case = TRUE)) {
         pb <- txtProgressBar(min = 0, max = length(files), style = 3)
         for (i in seq_along(files)) {
             data <- read_tsv(file = files[i], col_names = TRUE,col_types = "cidc")
             if(!missing(cases))
                 colnames(data)[2:ncol(data)] <- paste0(colnames(data)[2:ncol(data)],"_",cases[i])
 
             if(i == 1) df <- data
             if(i != 1) df <- merge(df, data, by=colnames(df)[1],all = TRUE)
             setTxtProgressBar(pb, i)
         }
         close(pb)
 
     } else if(grepl("Isoform Expression Quantification", data.type, ignore.case = TRUE)){
         pb <- txtProgressBar(min = 0, max = length(files), style = 3)
         for (i in seq_along(files)) {
             data <- read_tsv(file = files[i], col_names = TRUE, col_types = c("ccidcc"))
             if(!missing(cases)) data$barcode <- cases[i] else data$file <- i
             if(i == 1) df <- data
             if(i != 1) df <- rbind(df,data)
             setTxtProgressBar(pb, i)
         }
         close(pb)
38297df4
     }
045b7659
     return(df)
178c3b9a
 }
 
b2bc706f
 read_gene_level_copy_number <- function(
         files,
         cases,
         summarizedExperiment = FALSE
 ){
045b7659
     message("Reading Gene Level Copy Number files")
     gistic.df <- NULL
     gistic.list <- plyr::alply(files,1,.fun = function(file) {
         #message("Reading file: ", file)
         data <- read_tsv(
             file = file,
             col_names = TRUE,
             progress = TRUE,
             col_types = readr::cols()
         )
         colnames(data)[-c(1:5)] <- paste0(cases[match(file,files)],"_",  colnames(data)[-c(1:5)])
 
         return(data)
     })
 
     # Check if the data is in the same order
     stopifnot(all(unlist(gistic.list %>% map(function(y){all(y[,1:5] ==  gistic.list[[1]][,1:5])}) )))
 
     # need to check if it works in all cases
     # tested for HTSeq - FPKM-UQ and Counts only
     df <- bind_cols(
         gistic.list[[1]][,1:5], # gene info
         gistic.list %>%  map_dfc(.f = function(x) x[,6:8]) # copy number, min,max
dd5cb063
     )
045b7659
 
     if(summarizedExperiment) {
b2bc706f
         se <- make_se_from_gene_level_copy_number(df, cases)
045b7659
         return(se)
     }
     return(df)
dd5cb063
 }
 
b2bc706f
 make_se_from_gene_level_copy_number <- function(df, cases){
045b7659
     message("Creating a SummarizedExperiment object")
     rowRanges <- GRanges(
         seqnames = df$chromosome,
         ranges = IRanges(start = df$start, end = df$end),
         gene_id = df$gene_id,
         gene_name = df$gene_name
     )
 
     names(rowRanges) <- as.character(df$gene_id)
     colData <-  colDataPrepare(cases)
 
     # We have three data columns for each files
     assays <- lapply(
         c("[^max|min]_copy_number","min_copy_number", "max_copy_number"),
         function(x){
             ret <- data.matrix(subset(df,select = grep(x,colnames(df))))
             colnames(ret) <- cases
             rownames(ret) <- NULL
             ret
         })
     names(assays) <- c("copy_number","min_copy_number", "max_copy_number")
 
     message("Available assays in SummarizedExperiment : \n  => ",paste(names(assays), collapse = "\n  => "))
     rse <- SummarizedExperiment(assays = assays, rowRanges = rowRanges, colData = colData)
     return(rse)
7b7f18e0
 }
 
 
c8c9c620
 readGISTIC <- function(files, cases){
045b7659
     message("Reading GISTIC file")
     gistic.df <- NULL
     gistic.list <- plyr::alply(files,1,.fun = function(file) {
         message("Reading file: ", file)
         data <- read_tsv(
             file = file,
             col_names = TRUE,
             progress = TRUE,
             col_types = readr::cols()
         )
 
         aliquot <- colnames(data)[-c(1:3)]
         info <- splitAPICall(
             FUN = getBarcodefromAliquot,
             step = 20,
             items = aliquot
         )
 
         barcode <- as.character(info$submitter_id)[match(aliquot,as.character(info$aliquot_id))]
         colnames(data)[-c(1:3)] <- barcode
         return(data)
     })
     gistic.df <- gistic.list %>%
         join_all(by =  c("Gene Symbol","Gene ID","Cytoband"), type='full')
 
     return(gistic.df)
c8c9c620
 }
 
717cfd63
 # Reads Copy Number Variation files to a data frame, basically it will rbind it
f3faeb86
 #' @importFrom purrr map2_dfr
b2bc706f
 read_copy_number_variation <- function(files, cases){
045b7659
     message("Reading copy number variation files")
 
     col_types <- ifelse(any(grepl('ascat2', files)),"ccnnnnn","ccnnnd")
 
     purrr::map2_dfr(
         .x = files,
         .y = cases,
         .f = function(file,case) {
             data <- readr::read_tsv(file, col_names = TRUE, col_types = col_types);
             if(!missing(case)) data$Sample <- case
             data
         })
178c3b9a
 }
 
57189bb2
 # getBarcodeInfo(c("TCGA-A6-6650-01B"))
9d34d5f7
 #  getBarcodeInfo(c("DLBCL10508-sample"))
57189bb2
 addFFPE <- function(df) {
045b7659
     message("Add FFPE information. More information at: \n=> https://cancergenome.nih.gov/cancersselected/biospeccriteria \n=> http://gdac.broadinstitute.org/runs/sampleReports/latest/FPPP_FFPE_Cases.html")
     barcode <- df$barcode
     patient <- df$patient
 
     ffpe.info <- NULL
     ffpe.info <- splitAPICall(FUN = getFFPE,
                               step = 20,
                               items = df$patient)
 
     df <- merge(df, ffpe.info,by.x = "sample", by.y = "submitter_id")
     df <- df[match(barcode,df$barcode),]
     return(df)
57189bb2
 }
 
008efdf3
 # getFFPE("TCGA-A6-6650")
 #' @importFrom plyr rbind.fill
 #' @importFrom httr content
57189bb2
 getFFPE <- function(patient){
045b7659
     baseURL <- "https://api.gdc.cancer.gov/cases/?"
     options.pretty <- "pretty=true"
     options.expand <- "expand=samples"
     option.size <- paste0("size=",length(patient))
b2bc706f
     options.filter <- paste0(
         "filters=",
         URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
         paste0('"',paste(patient,collapse = '","')),
         URLencode('"]}}]}')
     )
045b7659
     url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))
     json  <- tryCatch(
         getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE),
         error = function(e) {
             message(paste("Error: ", e, sep = " "))
             message("We will retry to access GDC again! URL:")
             #message(url)
             fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE)
         }
     )
 
     results <- json$data$hits
     results <- rbind.fill(results$samples)[,c("submitter_id","is_ffpe")]
     return(results)
57189bb2
 }
c8c9c620
 
 getAliquot_ids <- function(barcode){
045b7659
     baseURL <- "https://api.gdc.cancer.gov/cases/?"
     options.fields <- "fields=samples.portions.analytes.aliquots.aliquot_id,samples.portions.analytes.aliquots.submitter_id"
     options.pretty <- "pretty=true"
     option.size <- paste0("size=",length(barcode))
     #message(paste(barcode,collapse = '","'))
     #message(paste0('"',paste(barcode,collapse = '","')))
b2bc706f
     options.filter <- paste0(
         "filters=",
         URLencode('{"op":"and","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
         paste0('"',paste(barcode,collapse = '","')),
         URLencode('"]}}]}')
     )
045b7659
     #message(paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&")))
     url <- paste0(baseURL,paste(options.pretty,options.fields, option.size, options.filter, sep = "&"))
     json  <- tryCatch(
         getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE),
         error = function(e) {
             message(paste("Error: ", e, sep = " "))
             message("We will retry to access GDC again! URL:")
             #message(url)
             fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE)
         }
     )
     results <- unlist(json$data$hits$samples)
     results.barcode <- grep("TCGA",results,value = TRUE)
     results.aliquot <- grep("TCGA",results,value = TRUE,invert = TRUE)
 
     df <- data.frame(results.aliquot,results.barcode)
     colnames(df) <- c("aliquot_id","barcode")
     return(df)
c8c9c620
 }
e4b4e205
 
 
 # getBarcodeInfo(c("TCGA-OR-A5K3-01A","C3N-00321-01"))
 # barcode is: sample_submitter_id
eb4267de
 #' @importFrom dplyr bind_cols slice row_number
 #'
dc887359
 getBarcodeInfo <- function(barcode) {
045b7659
     baseURL <- "https://api.gdc.cancer.gov/cases/?"
     options.pretty <- "pretty=true"
     options.expand <- "expand=samples,project,diagnoses,diagnoses.treatments,annotations,family_histories,demographic,exposures"
     option.size <- paste0("size=",length(barcode))
     options.filter <- paste0("filters=",
                              URLencode('{"op":"or","content":[{"op":"in","content":{"field":"cases.submitter_id","value":['),
                              paste0('"',paste(barcode,collapse = '","')),
                              URLencode('"]}},'),
                              URLencode('{"op":"in","content":{"field":"submitter_sample_ids","value":['),
                              paste0('"',paste(barcode,collapse = '","')),
                              URLencode('"]}},'),
                              URLencode('{"op":"in","content":{"field":"submitter_aliquot_ids","value":['),
                              paste0('"',paste(barcode,collapse = '","')),
                              URLencode('"]}}'),
                              URLencode(']}')
     )
     url <- paste0(baseURL,paste(options.pretty,options.expand, option.size, options.filter, sep = "&"))
     #message(url)
     json  <- tryCatch(
         getURL(url,fromJSON,timeout(600),simplifyDataFrame = TRUE),
         error = function(e) {
             message(paste("Error: ", e, sep = " "))
             message("We will retry to access GDC again! URL:")
             #message(url)
             fromJSON(content(getURL(url,GET,timeout(600)), as = "text", encoding = "UTF-8"), simplifyDataFrame = TRUE)
         }
     )
 
     results <- json$data$hits
 
     # no results
     if(length(results) == 0){
         return(data.frame(barcode,stringsAsFactors = FALSE))
9bda1de1
     }
045b7659
 
     submitter_id <- results$submitter_id
     submitter_aliquot_ids <- results$submitter_aliquot_ids
 
     if(!is.null(results$samples)) {
         samples <- rbindlist(results$samples, fill = TRUE)
         samples <- samples[match(barcode,samples$submitter_id),]
9d34d5f7
         samples$sample_submitter_id <- str_extract_all(
             samples$submitter_id,paste(barcode,collapse = "|")) %>%
045b7659
             unlist %>% as.character
 
         tryCatch({
             samples$submitter_id <-
9d34d5f7
                 str_extract_all(
                     samples$submitter_id,
                     paste(c(submitter_id,barcode), collapse = "|"),
                     simplify = TRUE
                 ) %>% as.character
045b7659
         }, error = function(e){
             samples$submitter_id <- submitter_id
         })
         df <- samples[!is.na(samples$submitter_id),]
         suppressWarnings({
             df[,c("updated_datetime","created_datetime")] <- NULL
         })
b6275ad3
     }
045b7659
 
 
     # We dont have the same cols for TCGA and TARGET so we need to check them
     if(!is.null(results$diagnoses)) {
         diagnoses <- rbindlist(lapply(results$diagnoses, function(x) if(is.null(x)) data.frame(NA) else x),fill = TRUE)
         diagnoses[,c("updated_datetime","created_datetime","state")] <- NULL
         if(any(grepl("submitter_id", colnames(diagnoses)))) {
9d34d5f7
             diagnoses$submitter_id <- gsub("-diagnosis|_diagnosis.*|-DIAG|diag-","", diagnoses$submitter_id)
045b7659
         }  else {
             diagnoses$submitter_id <- submitter_id
         }
         # this is required since the sample might not have a diagnosis
         if(!any(df$submitter_id %in% diagnoses$submitter_id)){
             diagnoses$submitter_id <- NULL
             # The sample migth have different sample types
             # The diagnosis the same for each one of these samples
             # in that case we will have a 1 to mapping and binding will
             # not work. We need then to replicate diagnosis to each sample
             # and not each patient
             # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R"))
             if(nrow(diagnoses) <  nrow(df)){
                 diagnoses <- plyr::ldply(
                     1:length(results$submitter_sample_ids),
                     .fun = function(x){
                         diagnoses[x] %>% # replicate diagnoses the number of samples
                             as.data.frame() %>%
                             dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
             }
 
             df <- dplyr::bind_cols(df %>% as.data.frame,diagnoses %>% as.data.frame)
         } else {
             df <- left_join(df, diagnoses, by = "submitter_id")
         }
9bda1de1
     }
045b7659
     if(!is.null(results$exposures)) {
b2bc706f
         exposures <- rbindlist(
             lapply(results$exposures, function(x) if(is.null(x)) data.frame(NA) else x),
             fill = TRUE
         )
045b7659
         exposures[,c("updated_datetime","created_datetime","state")] <- NULL
         if(any(grepl("submitter_id", colnames(exposures)))) {
9d34d5f7
             exposures$submitter_id <- gsub("-exposure|_exposure.*|-EXP","", exposures$submitter_id)
045b7659
         }  else {
             exposures$submitter_id <- submitter_id
         }
         if(!any(df$submitter_id %in% exposures$submitter_id)){
             exposures$submitter_id <- NULL
             df <- dplyr::bind_cols(df,exposures)
         } else {
             df <- left_join(df, exposures, by = "submitter_id")
         }
b6275ad3
     }
045b7659
 
 
     if(!is.null(results$demographic)) {
         demographic <- results$demographic
         demographic[,c("updated_datetime","created_datetime","state")] <- NULL
         if(any(grepl("submitter_id", colnames(demographic)))) {
b2bc706f
             demographic$submitter_id <- gsub(
                 "-demographic|_demographic.*|-DEMO|demo-",
                 "",
                 results$demographic$submitter_id
             )
045b7659
         } else {
             demographic$submitter_id <- submitter_id
         }
 
         if(!any(df$submitter_id %in% demographic$submitter_id)){
             demographic$submitter_id <- NULL
             demographic$updated_datetime  <- NULL
             demographic$created_datetime <- NULL
             # The sample migth have different sample types
             # The diagnosis the same for each one of these samples
             # in that case we will have a 1 to mapping and binding will
             # not work. We need then to replicate diagnosis to each sample
             # and not each patient
             # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R"))
             if(nrow(demographic) <  nrow(df)){
                 demographic <- plyr::ldply(
                     1:length(results$submitter_sample_ids),
                     .fun = function(x){
                         demographic[x,] %>% # replicate diagnoses the number of samples
                             as.data.frame() %>%
b2bc706f
                             dplyr::slice(
                                 rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))
                     })
045b7659
             }
 
             df <- dplyr::bind_cols(df  %>% as.data.frame,demographic)
         } else {
             df <- left_join(df,demographic, by = "submitter_id")
         }
9bda1de1
     }
045b7659
 
     df$bcr_patient_barcode <- df$submitter_id %>% as.character()
     projects.info <- results$project
     projects.info <- results$project[,grep("state",colnames(projects.info),invert = TRUE)]
 
     if(any(submitter_id %in% df$submitter_id)){
         projects.info <-  cbind("submitter_id" = submitter_id, projects.info)
 
         suppressWarnings({
b2bc706f
             df <- left_join(
                 df,
                 projects.info,
                 by = "submitter_id"
             )
045b7659
         })
b6275ad3
     } else {
045b7659
 
         if(nrow(projects.info) <  nrow(df)){
             projects.info <- plyr::ldply(
                 1:length(results$submitter_sample_ids),
                 .fun = function(x){
                     projects.info[x,] %>% # replicate diagnoses the number of samples
                         as.data.frame() %>%
b2bc706f
                         dplyr::slice(
                             rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))
                 })
045b7659
         }
 
         df <- dplyr::bind_cols(df,projects.info)
b6275ad3
     }
045b7659
 
     # Adding in the same order
 
 
     if(any(substr(barcode,1,str_length(df$submitter_id)) %in% df$submitter_id)){
         df <- df[match(substr(barcode,1,str_length(df$sample_submitter_id)),df$sample_submitter_id),]
         # This line should not exists, but some patients does not have clinical data
         # case: TCGA-R8-A6YH"
         # this has been reported to GDC, waiting answers
         # So we will remove this NA cases
         df <- df[!is.na(df$submitter_id),]
7697beda
     } else {
045b7659
         idx <- sapply(substr(barcode,1,str_length(df$submitter_aliquot_ids) %>% max),
                       FUN = function(x){
                           grep(x,df$submitter_aliquot_ids)
                       })
         df <- df[idx,]
38297df4
     }
045b7659
 
     # remove empty columns
     df <- df %>% as.data.frame() %>% dplyr::select(which(colSums(is.na(df)) < nrow(df)))
     return(df)
dc887359
 }
 
cb8583c7
 #' @title Prepare CEL files into an AffyBatch.
 #' @description Prepare CEL files into an AffyBatch.
 #' @param ClinData write
 #' @param PathFolder write
 #' @param TabCel write
 #' @examples
 #' \dontrun{
 #' to add example
 #' }
 #' @export
86722ee7
 #' @return Normalized Expression data from Affy eSets
cb8583c7
 TCGAprepare_Affy <- function(ClinData, PathFolder, TabCel){
045b7659
     if (!requireNamespace("affy", quietly = TRUE)) {
         stop("affy package is needed for this function to work. Please install it.",
              call. = FALSE)
     }
     if (!requireNamespace("Biobase", quietly = TRUE)) {
         stop("affy package is needed for this function to work. Please install it.",
              call. = FALSE)
     }
     affy_batch <- affy::ReadAffy(filenames = as.character(paste(TabCel$samples, ".CEL", sep = "")))
 
     eset <- affy::rma(affy_batch)
 
     mat <- Biobase::exprs(eset)
 
     return(mat)
 
cb8583c7
 }