R/download.R
178c3b9a
 #' @title Download GDC data
 #' @description
95f473a5
 #'   Uses the GDC API or the GDC Data Transfer Tool (gdc-client) to download GDC data.
 #'   The files to download are selected with the query argument.
178c3b9a
 #'   The data from the query will be saved in the folder:
 #'   directory/project/source/data.category/data.type (source is "legacy" or "harmonized")
 #' @param query A query for GDCquery function
86722ee7
 #' @param token.file Token file to download controlled data (only for method = "client")
14ebe74c
 #' @param method Uses the API (POST method) or gdc client tool. Options "api", "client".
c1b5e11c
 #' API is faster, but the data might get corrupted in the download, and it might need to be executed again
 #' @param directory Directory/Folder where the data will be saved. Default: GDCdata
8f1bff6b
 #' @param files.per.chunk This will make the API method download only n (files.per.chunk) files at a time.
 #' This may reduce download problems when the data size is too large. Expects an integer number (example: files.per.chunk = 6)
95f473a5
 #' @importFrom tools md5sum
 #' @importFrom utils untar
0495b755
 #' @import httr
68de40aa
 #' @importFrom methods is
178c3b9a
 #' @export
04368b05
 #' @examples
8b20970c
 #' query <- GDCquery(
 #'   project = "TCGA-ACC",
 #'   data.category =  "Copy number variation",
 #'   legacy = TRUE,
 #'   file.type = "hg19.seg",
 #'   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
 #'  )
98cbbbc9
 #' # data will be saved in  GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
04368b05
 #' GDCdownload(query, method = "api")
68de40aa
 #' \dontrun{
cd1ab917
 #'     # Download clinical data from XML
 #'     query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
 #'     GDCdownload(query, files.per.chunk = 200)
15cdb167
 #'     query <- GDCquery(
 #'         project = "TARGET-AML",
 #'         data.category = "Transcriptome Profiling",
 #'         data.type = "miRNA Expression Quantification",
 #'         workflow.type = "BCGSC miRNA Profiling",
 #'         barcode = c("TARGET-20-PARUDL-03A-01R","TARGET-20-PASRRB-03A-01R")
 #'     )
411e4f8a
 #'     # data will be saved in:
 #'     # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
 #'     GDCdownload(query, method = "client", directory = "example_data_dir")
15cdb167
 #'     acc.gbm <- GDCquery(
 #'         project =  c("TCGA-ACC","TCGA-GBM"),
 #'         data.category = "Transcriptome Profiling",
 #'         data.type = "Gene Expression Quantification",
 #'         workflow.type = "STAR - Counts"
 #'     )
 #'     GDCdownload(
 #'        query = acc.gbm,
 #'        method = "api",
 #'        directory = "example",
 #'        files.per.chunk = 50
 #'    )
c669f8e2
 #' }
178c3b9a
 #' @return Shows the output from the GDC transfer tools
02c318ae
 #' @author Tiago Chedraoui Silva
ed005a2a
 GDCdownload <- function(
     query,
     token.file,
     method = "api",
     directory = "GDCdata",
     files.per.chunk = NULL
 ) {
     # Server availability check (helper defined elsewhere in the package)
     isServeOK()
     if (missing(query)) stop("Please set query argument")

     if (length(method) != 1 || !(method %in% c("api","client"))) {
         stop("method arguments possible values are: 'api' or 'client'")
     }

     # Reject chunk sizes that cannot be used to split a manifest
     if (!is.null(files.per.chunk)) {
         if (!is.numeric(files.per.chunk) || length(files.per.chunk) != 1 ||
             is.na(files.per.chunk) || files.per.chunk < 1) {
             stop("files.per.chunk should be a single integer number greater than 0")
         }
     }

     data.types <- unique(getResults(query)$data_type)
     if (length(data.types) > 1) {
         print(knitr::kable(sort(data.types),col.names = "data_type in query"))
         stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.")
     }

     # The legacy flag selects both the folder name and the API endpoint
     legacy <- isTRUE(query$legacy)
     data.source <- if (legacy) "legacy" else "harmonized"
     server <- if (legacy) {
         "https://api.gdc.cancer.gov/legacy/data/"
     } else {
         "https://api.gdc.cancer.gov/data/"
     }

     dir.create(directory, showWarnings = FALSE, recursive = TRUE)
     for (proj in unique(unlist(query$project))) {
         message("Downloading data for project ", proj)

         # Restrict the query results to the files of the current project
         query.aux <- query
         results <- getResults(query.aux)
         results <- results[results$project == proj,]
         query.aux$results[[1]] <- results

         manifest <- getManifest(query.aux)

         # Destination: directory/project/source/data_category/data_type
         path <- unique(
             file.path(
                 proj, data.source,
                 gsub(" ","_", results$data_category),
                 gsub(" ","_",results$data_type))
         )
         path <- file.path(directory, path)

         # Check if the files were already downloaded by this package
         manifest <- checkAlreadyDownloaded(path,manifest)

         if (nrow(manifest) == 0) {
             message("All samples have been already downloaded")
             next
         }

         # There is a bug in the API, if the files has the same name it will not download correctly
         # so method should be set to client if there are files with duplicated names.
         # The switch is kept local to this project so that it does not change the
         # method requested by the user for the remaining projects.
         proj.method <- method
         if (nrow(manifest) > length(unique(manifest$filename))) proj.method <- "client"

         if (proj.method == "client") {
             # This will find gdc client, if not installed it will install it
             gdc.client.bin <- GDCclientInstall()

             # Creates a file with the gdc manifest format
             readr::write_delim(manifest,"gdc_manifest.txt",delim = "\t")

             # Configuration file read by gdc-client (--config)
             readr::write_lines(
                 c("[download]","retry_amount = 6",paste0("dir =",path)),
                 file = "gdc_client_configuration.dtt"
             )

             # Quote the paths so that folders with spaces do not break the command
             cmd <- paste0(
                 shQuote(gdc.client.bin),
                 " download -m gdc_manifest.txt --config gdc_client_configuration.dtt"
             )

             dir.create(path,recursive = TRUE,showWarnings = FALSE)
             # path.expand: a quoted "~" would not be expanded by the shell
             if (!missing(token.file)) {
                 cmd <- paste0(cmd," -t ", shQuote(path.expand(token.file)))
             }

             # Download all the files in the manifest using gdc client
             message(paste0("GDCdownload will download: ",
                            humanReadableByteCount(sum(as.numeric(manifest$size)))))
             message(paste0("Executing GDC client with the following command:\n",cmd))

             # Best effort: a failure of the client is reported but does not stop
             # the download of the remaining projects
             status <- tryCatch({
                 system(cmd)
             }, warning = function(w) {
                 message("GDC client raised a warning: ", conditionMessage(w))
                 NA
             }, error = function(e) {
                 message("GDC client could not be executed: ", conditionMessage(e))
                 NA
             })
             if (!is.na(status) && status != 0) {
                 message("GDC client finished with exit status ", status)
             }

         } else {
             total.size <- sum(as.numeric(manifest$size))

             if (nrow(manifest) > 1) {
                 # Several files are received as one tar.gz archive
                 name <- paste0(gsub(" |:","_",date()),".tar.gz")
                 unlink(name)
                 message(
                     paste0(
                         "GDCdownload will download ", nrow(manifest), " files. A total of " ,
                         humanReadableByteCount(total.size)
                     )
                 )
             } else {
                 # case with one file only. This is not at tar.gz
                 name <- manifest$filename
                 message(
                     paste0(
                         "GDCdownload will download: ",
                         humanReadableByteCount(total.size)
                     )
                 )
             }

             # Chunk size used for this project only: an automatically computed
             # value must not be carried over to the next project
             chunk.size <- files.per.chunk
             if (is.null(chunk.size) && isTRUE(total.size > 10^9)) {
                 message("The total size of files is big. We will download files in chunks")
                 # At least one file per chunk, even if the mean file size is above 1 GB
                 chunk.size <- max(1, floor(10^9 / mean(as.numeric(manifest$size))))
             }

             if (is.null(chunk.size)) {
                 message(paste0("Downloading as: ", name))
                 tryCatch({
                     GDCdownload.aux(server, manifest, name, path)
                 }, error = function(e) {
                     message("Download failed. We will retry with smaller chunks")
                     remaining <- checkAlreadyDownloaded(path,manifest)
                     if (nrow(remaining) == 0) return(invisible(NULL))
                     # split in groups of 100 MB
                     step <- ceiling(100000000 / as.numeric(remaining$size[1]))
                     if (is.na(step) || step < 1) step <- 1
                     step <- min(step, nrow(remaining))
                     GDCdownload.by.chunk(server, remaining, name, path, step)
                 })
             } else {
                 # If error we will try another time.
                 tryCatch({
                     GDCdownload.by.chunk(server, manifest, name, path, chunk.size)
                 }, error = function(e) {
                     message("At least one of the chunks download was not correct. We will retry")
                     remaining <- checkAlreadyDownloaded(path,manifest)
                     if (nrow(remaining) > 0) {
                         GDCdownload.by.chunk(server, remaining, name, path, chunk.size)
                     }
                 })
             }
         }
     }
     invisible(NULL)
 }
 
1692d5bd
 #' @title Get a Manifest from GDCquery output that can be used with GDC-client
 #' @description
 #' Get a Manifest from GDCquery output that can be used with GDC-client
 #' @param query A query for GDCquery function
 #' @param save Write Manifest to a txt file (tab separated)
 #' @examples
15cdb167
 #' query <- GDCquery(
 #'   project = "TARGET-AML",
 #'   data.category = "Transcriptome Profiling",
 #'   data.type = "Gene Expression Quantification",
 #'   workflow.type = "STAR - Counts",
 #'   barcode = c("TARGET-20-PADZCG-04A-01R","TARGET-20-PARJCR-09A-01R")
 #'  )
1692d5bd
 #' getManifest(query)
 #' @export
b95c7f26
 getManifest <- function(query, save = FALSE) {
     # Columns taken from the query results, in manifest order
     result.columns <- c("file_id","file_name","md5sum","file_size","state")
     # Names expected by gdc-client for those columns
     manifest.columns <- c("id","filename","md5","size","state")

     # The results table is stored as the first element of query$results
     manifest <- query$results[[1]][, result.columns]
     colnames(manifest) <- manifest.columns

     if (!save) return(manifest)

     # Optionally write the manifest, tab separated, in the working directory
     fname <- "gdc_manifest.txt"
     readr::write_delim(manifest, fname, delim = "\t")
     message("Manifest saved as: ", file.path(getwd(), fname))
     manifest
 }
 
056ee6ed
 # Download the files of a manifest in consecutive chunks of `step` files.
 #
 # server:   URL of the GDC data endpoint, forwarded to GDCdownload.aux.
 # manifest: data.frame with one row per file (columns id, filename, md5, size).
 # name:     base name of the downloaded archive; "_<chunk index>" is inserted
 #           before ".tar" so that every chunk gets its own archive name.
 # path:     destination folder, forwarded to GDCdownload.aux.
 # step:     number of files per chunk. Values below 1 are raised to 1 and
 #           fractional values are rounded down.
 #
 # Returns NULL invisibly. Errors raised by GDCdownload.aux are not caught here.
 GDCdownload.by.chunk <- function(
     server = "https://api.gdc.cancer.gov/data/",
     manifest,
     name = "TCGAbiolinks_download",
     path = ".",
     step = 1
 ){
     n.files <- nrow(manifest)
     # Nothing to do (0:ceiling(0/step - 1) would iterate over 0 and -1)
     if (is.null(n.files) || n.files == 0) return(invisible(NULL))

     if (!is.numeric(step) || length(step) != 1 || is.na(step)) {
         stop("step should be a single number")
     }
     # Keep the chunk size within 1..n.files so the row ranges are always valid
     step <- min(max(1, floor(step)), n.files)
     n.chunks <- ceiling(n.files / step)

     for (chunk in seq_len(n.chunks)) {
         idx <- chunk - 1 # zero-based index used in the archive name
         first <- idx * step + 1
         last <- min(chunk * step, n.files)
         manifest.aux <- manifest[first:last,]
         size <- humanReadableByteCount(sum(as.numeric(manifest.aux$size)))
         name.aux <- gsub("\\.tar",paste0("_",idx,".tar"),name)
         message(
             paste0(
                 "Downloading chunk ", chunk, " of ", n.chunks,
                 " (", nrow(manifest.aux)," files, size = ", size,") ",
                 "as ", name.aux
             )
         )
         repeat {
             ret <- GDCdownload.aux(server, manifest.aux, name.aux, path)
             # A NULL return value is treated as completed as well, so that a
             # missing return value cannot raise "argument is of length zero"
             if (is.null(ret) || isTRUE(ret == 1)) break
         }
     }
     invisible(NULL)
 }
 
056ee6ed
 # Download the files listed in `manifest` with a single POST request to the
 # GDC data endpoint, check their md5 sums and move them to
 # path/<file id>/<file name>.
 #
 # server:   URL of the GDC data endpoint that receives the POST request.
 # manifest: data.frame with (at least) the columns id, filename and md5.
 # name:     file used to store the tar archive received for several files.
 #           A single file is not received as an archive, so it is always
 #           saved under its own manifest file name.
 # path:     destination folder.
 #
 # Returns 1 when the download finished and raises an error when it failed.
 # Files with a wrong md5 sum are deleted and reported with a message.
 GDCdownload.aux <- function(
     server = "https://api.gdc.cancer.gov/data/",
     manifest,
     name = "TCGAbiolinks_download",
     path = "."
 ){
     n.files <- nrow(manifest)
     # Nothing to request
     if (n.files == 0) return(1)

     is.archive <- n.files > 1
     # The md5 check below looks for a single file under manifest$filename, so
     # the response has to be written there (callers that download by chunk
     # pass an archive name even when the chunk holds one file only)
     if (!is.archive) name <- manifest$filename[1]

     result <- tryCatch({
         bin <- getURL(
             server,
             POST,
             body =  list(ids=list(manifest$id)),
             encode = "json",
             progress()
         )
         if (bin[[2]] == "405") {
             message("ERROR accessing GDC. Trying again...")
             bin <- getURL("https://api.gdc.cancer.gov/data/",
                           POST,
                           body =  list(ids=list(manifest$id)),
                           encode = "json",
                           progress()
             )
         }
         writeBin(getURL(bin,content, as = "raw",encoding = "UTF-8"), name)

         if (is.archive) {
             success <- utils::untar(name)
             unlink(name) # remove tar
             if (success != 0) {
                 stop("There was an error in the download process, please execute it again")
             }
         }

         # moving to project/source/data_category/data_type/file_id
         for (i in seq_along(manifest$filename)) {
             id <- manifest$id[i]
             # Files of an archive are extracted into a folder named by their id
             file <- if (is.archive) {
                 file.path(id, manifest$filename[i])
             } else {
                 manifest$filename[i]
             }

             # Check status. isTRUE: a missing file or md5 sum gives NA, which
             # is handled like a corrupted file instead of raising an error
             md5.ok <- isTRUE(unname(tools::md5sum(file)) == manifest$md5[i])
             if (!md5.ok) {
                 message(paste0("File corrupted:", file))
                 message("Run GDCdownload again to download it")
                 unlink(file)
                 next
             }
             move(file, file.path(path, id, manifest$filename[i]))
         }
         1
     }, warning = function(w) {
         # NOTE(review): as before, a warning ends the processing of this
         # download and is not treated as a failure; it is now reported
         message("Warning during the download: ", conditionMessage(w))
         1
     }, error = function(e) {
         unlink(name) # remove tar
         e
     })

     if (inherits(result, "error")) {
         stop(
             paste0(
                 "There was an error in the download process (we might had a connection problem with GDC server).",
                 "\nPlease run this function it again.",
                 "\nTry using method = `client` or setting files.per.chunk to a small number.",
                 "\nOriginal error: ", conditionMessage(result))
         )
     }
     message("Download completed")
     1
 }
178c3b9a
 
94542aca
 # Format a number of bytes as a human readable string using decimal units
 # (1 KB = 1000 B), e.g. 1500 -> "1.5 KB".
 #
 # bytes: a single number.
 # Returns a character string "<value> <unit>", the unit being one of
 # B, KB, MB, GB, TB, PB or EB (larger values are expressed in EB).
 humanReadableByteCount <- function(bytes) {
     unit <- 1000
     prefixes <- c("K", "M", "G", "T", "P", "E")
     if (is.na(bytes) || bytes < unit) return(paste0(bytes, " B"))
     # Number of powers of `unit` that fit in `bytes`; comparing against the
     # thresholds avoids the rounding problems of floor(log(bytes) / log(unit))
     exp <- findInterval(bytes, unit ^ seq_along(prefixes))
     nb <- bytes / (unit ^ exp)
     paste(nb, paste0(prefixes[exp], "B"))
 }
178c3b9a
 # Return the location of the gdc-client executable: the one found through the
 # PATH if there is one, otherwise the matching file(s) of the working
 # directory, otherwise an empty string.
 GDCclientPath <- function(){
     on.path <- Sys.which("gdc-client")
     if (on.path != "") return(on.path)

     # Files of the working directory that start like the client and do not end
     # with one of the letters z, i or p (this leaves out the downloaded zip)
     in.wd <- dir(pattern = "gdc-client*[^zip]$", full.names = TRUE)
     if (length(in.wd) == 0) return("")
     in.wd
 }
 
 # Check whether a gdc-client executable is available, either through the PATH
 # or as a file of the working directory. Returns TRUE or FALSE.
 GDCclientExists <- function(){
     on.path <- Sys.which("gdc-client.exe") != "" || Sys.which("gdc-client") != ""
     # The comparison belongs outside of length(): length(dir(...) > 0) only
     # worked because a non-zero count is coerced to TRUE
     in.wd <- length(dir(pattern = "gdc-client*[^zip]$")) > 0
     return(on.path || in.wd)
 }
2d421843
 #' Locate the gdc-client executable, downloading it when it is not available
 #'
 #' If no client is found, the download links are read from the GDC Data
 #' Transfer Tool web page (a hard-coded list of links is used when the page
 #' cannot be read), the zip file for the current operating system is
 #' downloaded into the working directory and extracted there.
 #' @return The path of the client, as given by GDCclientPath()
 #' @importFrom xml2 read_html
 #' @importFrom downloader download
 #' @importFrom rvest html_nodes html_attr %>%
 #' @noRd
 GDCclientInstall <- function(){
     if (GDCclientExists()) return(GDCclientPath())

     # Links used when the web page cannot be read or lists no usable link
     fallback.links <- c(
         "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_Windows_x64.zip",
         "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_Ubuntu_x64.zip",
         "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_OSX_x64_10.12.6.zip"
     )
     links <- tryCatch({
         read_html("https://gdc.cancer.gov/access-data/gdc-data-transfer-tool")  %>% html_nodes("a") %>% html_attr("href")
     }, error = function(e) {
         fallback.links
     })

     bin <- links[grep("public.*zip",links)]
     # The fallback links do not contain "public": without this step they would
     # all be discarded by the filter above
     if (length(bin) == 0) {
         bin <- grep("gdc-client.*zip", unique(c(links, fallback.links)), value = TRUE)
     }

     # Keep the links of the current operating system
     if (is.windows()) bin <- bin[grep("client*.*windows", bin,ignore.case = TRUE)]
     if (is.mac()) bin <- bin[grep("client*.*OSX", bin)]
     if (is.linux()) {
         if (grepl("ubuntu",Sys.info()["version"],ignore.case = TRUE)){
             bin <- bin[grep("client*.*Ubuntu", bin)]
         } else {
             bin <- bin[grep("client*.*Cent", bin)]
         }
     }

     bin <- bin[!is.na(bin)]
     if (length(bin) == 0) {
         stop(
             "Could not find a gdc-client download link for this operating system. ",
             "Please install the GDC Data Transfer Tool manually and add it to the PATH."
         )
     }
     # Several links may match (e.g. more than one build): use the first one
     bin <- bin[1]

     # A zip file is binary data on every platform
     download(bin, basename(bin), mode = "wb")
     utils::unzip(basename(bin))
     if (file.exists("gdc-client")) Sys.chmod("gdc-client")
     return(GDCclientPath())
 }
cf2ebdfb
 
480130c1
 # Remove from the manifest the files that are already in the destination
 # folder, either as path/<id>/<filename> or directly as path/<filename>.
 #
 # path:     folder where the downloaded files are stored.
 # manifest: data.frame with the columns id and filename.
 # Returns the rows of the manifest whose files were not found.
 checkAlreadyDownloaded <- function(path,manifest){
     in.id.folder <- file.exists(file.path(path, manifest$id, manifest$filename))
     in.root <- file.exists(file.path(path, manifest$filename))
     to.download <- !in.id.folder & !in.root

     n.found <- sum(!to.download)
     if (n.found > 0) {
         message(
             "Of the ", nrow(manifest), " files for download ",
             n.found, " already exist."
         )
         if (any(to.download)) {
             message("We will download only those that are missing ones.")
         }
     }
     manifest[to.download,]
 }