178c3b9a |
#' @title Download GDC data
#' @description
#' Uses GDC API or GDC transfer tool to download gdc data.
#' The user can use query argument.
#' The data from query will be saved in a folder: project/data.category
#' @param query A query for GDCquery function
#' @param token.file Token file to download controlled data (only for method = "client")
#' @param method Uses the API (POST method) or gdc client tool. Options "api", "client".
#' API is faster, but the data might get corrupted in the download, and it might need to be executed again
#' @param directory Directory/Folder where the data will be downloaded. Default: GDCdata
#' @param files.per.chunk This will make the API method only download n (files.per.chunk) files at a time.
#' This may reduce the download problems when the data size is too large. Expected an integer number (example files.per.chunk = 6)
#' @importFrom tools md5sum
#' @importFrom utils untar
#' @import httr
#' @importFrom methods is
#' @export
#' @examples
#' query <- GDCquery(
#'   project = "TCGA-ACC",
#'   data.category = "Copy number variation",
#'   legacy = TRUE,
#'   file.type = "hg19.seg",
#'   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
#' )
#' # data will be saved in GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
#' GDCdownload(query, method = "api")
#' \dontrun{
#'   # Download clinical data from XML
#'   query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
#'   GDCdownload(query, files.per.chunk = 200)
#'   query <- GDCquery(
#'     project = "TARGET-AML",
#'     data.category = "Transcriptome Profiling",
#'     data.type = "miRNA Expression Quantification",
#'     workflow.type = "BCGSC miRNA Profiling",
#'     barcode = c("TARGET-20-PARUDL-03A-01R","TARGET-20-PASRRB-03A-01R")
#'   )
#'   # data will be saved in:
#'   # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
#'   GDCdownload(query, method = "client", directory = "example_data_dir")
#'   acc.gbm <- GDCquery(
#'     project = c("TCGA-ACC","TCGA-GBM"),
#'     data.category = "Transcriptome Profiling",
#'     data.type = "Gene Expression Quantification",
#'     workflow.type = "STAR - Counts"
#'   )
#'   GDCdownload(
#'     query = acc.gbm,
#'     method = "api",
#'     directory = "example",
#'     files.per.chunk = 50
#'   )
#' }
#' @return Called for its side effects (files are written under `directory`);
#' returns `NULL` invisibly. When method = "client" the output of the GDC
#' transfer tool is shown.
#' @author Tiago Chedraoui Silva
GDCdownload <- function(
    query,
    token.file,
    method = "api",
    directory = "GDCdata",
    files.per.chunk = NULL
) {
    # Validate the arguments first so that a bad call fails fast, before the
    # GDC server is contacted.
    if (missing(query)) stop("Please set query argument")

    if (!(method %in% c("api", "client"))) {
        stop("method arguments possible values are: 'api' or 'client'")
    }

    isServeOK()

    data.types <- unique(getResults(query)$data_type)
    if (length(data.types) > 1) {
        print(knitr::kable(sort(data.types), col.names = "data_type in query"))
        stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.")
    }

    data.source <- ifelse(query$legacy, "legacy", "harmonized")

    dir.create(directory, showWarnings = FALSE, recursive = TRUE)

    for (proj in unique(unlist(query$project))) {
        message("Downloading data for project ", proj)

        # Restrict the query to the files that belong to the current project
        query.aux <- query
        all.results <- getResults(query.aux)
        results <- all.results[all.results$project == proj, ]
        query.aux$results[[1]] <- results
        manifest <- getManifest(query.aux)

        # Destination: directory/project/source/data_category/data_type
        path <- unique(
            file.path(
                proj, data.source,
                gsub(" ", "_", results$data_category),
                gsub(" ", "_", results$data_type)
            )
        )
        path <- file.path(directory, path)

        # Check if the files were already downloaded by this package
        manifest <- checkAlreadyDownloaded(path, manifest)

        if (nrow(manifest) == 0) {
            message("All samples have been already downloaded")
            next
        }

        # There is a bug in the API, if the files has the same name it will not download correctly
        # so method should be set to client if there are files with duplicated names
        # However for clinical XML recurrent and primary are the same file. So we will ignore that case
        # The decision is kept local to this project so that it does not
        # silently change the method used for the remaining projects.
        proj.method <- method
        if (nrow(manifest) > length(unique(manifest$filename))) proj.method <- "client"

        total.size <- sum(as.numeric(manifest$size))

        if (proj.method == "client") {
            # This will find gdc client, if not installed it will install it
            gdc.client.bin <- GDCclientInstall()
            if (length(gdc.client.bin) == 0 || !nzchar(gdc.client.bin[1])) {
                stop("The gdc-client executable could not be found or installed")
            }
            gdc.client.bin <- gdc.client.bin[1]

            # Creates a file with the gdc manifest format and the client
            # configuration (both are written to the working directory)
            readr::write_delim(manifest, "gdc_manifest.txt", delim = "\t")
            readr::write_lines(
                c("[download]", "retry_amount = 6", paste0("dir =", path)),
                file = "gdc_client_configuration.dtt"
            )
            cmd <- paste0(
                gdc.client.bin,
                " download -m gdc_manifest.txt --config gdc_client_configuration.dtt"
            )

            dir.create(path, recursive = TRUE, showWarnings = FALSE)

            # Quote the token path so that paths containing spaces survive
            if (!missing(token.file)) cmd <- paste0(cmd, " -t ", shQuote(token.file))

            # Download all the files in the manifest using gdc client
            message(paste0("GDCdownload will download: ", humanReadableByteCount(total.size)))
            message(paste0("Executing GDC client with the following command:\n", cmd))

            # Still best effort (a failure does not abort the other projects),
            # but the problem is reported instead of being silently dropped.
            status <- tryCatch(
                system(cmd),
                warning = function(w) {
                    message("GDC client raised a warning: ", conditionMessage(w))
                    NULL
                },
                error = function(e) {
                    message("GDC client could not be executed: ", conditionMessage(e))
                    NULL
                }
            )
            if (is.numeric(status) && length(status) == 1 && status != 0) {
                message("GDC client finished with a non-zero exit status: ", status)
            }
        } else {
            if (nrow(manifest) > 1) {
                # Several files are downloaded as a single tar.gz archive
                name <- paste0(gsub(" |:", "_", date()), ".tar.gz")
                unlink(name)
                message(
                    paste0(
                        "GDCdownload will download ", nrow(manifest), " files. A total of ",
                        humanReadableByteCount(total.size)
                    )
                )
            } else {
                # case with one file only. This is not a tar.gz
                name <- manifest$filename
                message(
                    paste0(
                        "GDCdownload will download: ",
                        humanReadableByteCount(total.size)
                    )
                )
            }

            server <- ifelse(
                query$legacy,
                "https://api.gdc.cancer.gov/legacy/data/",
                "https://api.gdc.cancer.gov/data/"
            )

            # Chunk size for this project only; never below one file per chunk
            chunk.size <- files.per.chunk
            if (is.null(chunk.size) && total.size > 10^9) {
                message("The total size of files is big. We will download files in chunks")
                chunk.size <- max(1, floor(10^9 / mean(as.numeric(manifest$size))))
            }

            if (is.null(chunk.size)) {
                message(paste0("Downloading as: ", name))
                tryCatch({
                    GDCdownload.aux(server, manifest, name, path)
                }, error = function(e) {
                    message("Download failed. We will retry with smaller chunks")
                    remaining <- checkAlreadyDownloaded(path, manifest)
                    if (nrow(remaining) == 0) return(invisible(NULL))
                    # split in groups of 100 MB
                    step <- ceiling(100000000 / as.numeric(remaining$size[1]))
                    if (!is.finite(step) || step < 1) step <- 1
                    GDCdownload.by.chunk(server, remaining, name, path, step)
                })
            } else {
                # If error we will try another time.
                tryCatch({
                    GDCdownload.by.chunk(server, manifest, name, path, chunk.size)
                }, error = function(e) {
                    message("At least one of the chunks download was not correct. We will retry")
                    remaining <- checkAlreadyDownloaded(path, manifest)
                    if (nrow(remaining) == 0) return(invisible(NULL))
                    GDCdownload.by.chunk(server, remaining, name, path, chunk.size)
                })
            }
        }
    }

    invisible(NULL)
}
|
1692d5bd |
#' @title Get a Manifest from GDCquery output that can be used with GDC-client
#' @description
#' Builds a GDC-client manifest table from the first result set stored in a
#' GDCquery output.
#' @param query A query for GDCquery function
#' @param save Write Manifest to a txt file (tab separated)
#' @return A table with the columns id, filename, md5, size and state, one
#' row per file of the query.
#' @examples
#' query <- GDCquery(
#'   project = "TARGET-AML",
#'   data.category = "Transcriptome Profiling",
#'   data.type = "Gene Expression Quantification",
#'   workflow.type = "STAR - Counts",
#'   barcode = c("TARGET-20-PADZCG-04A-01R","TARGET-20-PARJCR-09A-01R")
#' )
#' getManifest(query)
#' @export
getManifest <- function(query, save = FALSE) {
    # Manifest column name -> column of the query results it is taken from
    columns <- c(
        id = "file_id",
        filename = "file_name",
        md5 = "md5sum",
        size = "file_size",
        state = "state"
    )

    manifest <- query$results[[1]][, unname(columns)]
    colnames(manifest) <- names(columns)

    if (save) {
        # The manifest is written to the current working directory
        fname <- "gdc_manifest.txt"
        readr::write_delim(manifest, fname, delim = "\t")
        saved.as <- file.path(getwd(), fname)
        message("Manifest saved as: ", saved.as)
    }

    return(manifest)
}
|
056ee6ed |
# Download the files of a manifest in consecutive chunks of `step` files.
#
# server:   URL of the GDC data endpoint that receives the POST request.
# manifest: table with (at least) the columns id, filename and size.
# name:     name of the archive to write; "_<chunk index>" is inserted before
#           ".tar" for every chunk that holds more than one file.
# path:     destination folder handed over to GDCdownload.aux.
# step:     number of files per chunk. Values that are not a finite number
#           >= 1 fall back to one file per chunk.
#
# Returns NULL invisibly. Failures are signalled as errors by GDCdownload.aux.
GDCdownload.by.chunk <- function(
    server = "https://api.gdc.cancer.gov/data/",
    manifest,
    name = "TCGAbiolinks_download",
    path = ".",
    step = 1
) {
    n.files <- nrow(manifest)

    # Nothing to do; without this guard the chunk sequence would be c(0, -1)
    if (n.files == 0) return(invisible(NULL))

    if (!is.numeric(step) || length(step) != 1 || is.na(step) || step < 1) step <- 1
    step <- min(floor(step), n.files)
    n.chunks <- ceiling(n.files / step)

    for (chunk in seq_len(n.chunks)) {
        idx <- chunk - 1
        first <- idx * step + 1
        last <- min(chunk * step, n.files)
        manifest.aux <- manifest[first:last, ]
        size <- humanReadableByteCount(sum(as.numeric(manifest.aux$size)))

        if (nrow(manifest.aux) == 1) {
            # GDCdownload.aux looks for a single file under its own file
            # name, not under an archive name
            name.aux <- manifest.aux$filename
        } else {
            name.aux <- gsub("\\.tar", paste0("_", idx, ".tar"), name)
        }

        message(
            paste0(
                "Downloading chunk ", chunk, " of ", n.chunks,
                " (", nrow(manifest.aux), " files, size = ", size, ") ",
                "as ", name.aux
            )
        )

        repeat {
            ret <- GDCdownload.aux(server, manifest.aux, name.aux, path)
            # A missing status used to crash with "argument is of length
            # zero"; report it explicitly instead.
            if (length(ret) != 1) {
                stop("GDCdownload.aux did not report the download status of chunk ", chunk)
            }
            if (ret == 1) break
        }
    }

    invisible(NULL)
}
|
056ee6ed |
# Download the files listed in a manifest with one POST request to the GDC
# data endpoint, verify their md5 checksums and move them below `path`.
#
# server:   URL of the GDC data endpoint.
# manifest: table with (at least) the columns id, filename and md5.
# name:     file the response is written to when several files are requested
#           (the response is then unpacked with untar). A single file is
#           always written under its own file name, because that is where the
#           checksum step looks for it.
# path:     destination folder. Files end up in path/<file id>/<file name>.
#
# Returns 1 when the download was processed and raises an error otherwise.
# Corrupted files are removed and reported through messages only.
GDCdownload.aux <- function(
    server = "https://api.gdc.cancer.gov/data/",
    manifest,
    name = "TCGAbiolinks_download",
    path = "."
) {
    several.files <- nrow(manifest) > 1
    single.file <- nrow(manifest) == 1

    if (single.file) name <- manifest$filename[1]

    # The block yields its status as a value. A return() here would leave the
    # whole function and skip the completion message below.
    result <- tryCatch({
        bin <- getURL(
            server,
            POST,
            body = list(ids = list(manifest$id)),
            encode = "json",
            progress()
        )

        # NOTE(review): bin[[2]] is presumably the HTTP status code of the
        # response - confirm against getURL.
        if (bin[[2]] == "405") {
            message("ERROR accessing GDC. Trying again...")
            bin <- getURL(
                "https://api.gdc.cancer.gov/data/",
                POST,
                body = list(ids = list(manifest$id)),
                encode = "json",
                progress()
            )
        }

        writeBin(getURL(bin, content, as = "raw", encoding = "UTF-8"), name)

        if (several.files) {
            success <- untar(name)
            unlink(name) # remove tar
            if (success != 0) {
                stop("There was an error in the download process, please execute it again")
            }
        }

        # moving to project/source/data_category/data_type/file_id
        for (i in seq_along(manifest$filename)) {
            if (several.files) file <- file.path(manifest$id[i], manifest$filename[i])
            if (single.file) file <- file.path(manifest$filename[i])
            id <- manifest$id[i]

            # Check status
            if (!(md5sum(file) == manifest$md5[i])) {
                message(paste0("File corrupted:", file))
                message("Run GDCdownload again to download it")
                unlink(file)
                next
            }

            if (several.files) {
                move(file, file.path(path, file))
            }
            if (single.file) {
                move(file, file.path(path, id, file))
            }
        }
        1
    }, warning = function(w) {
        # NOTE(review): a warning interrupts the steps above but is still
        # reported as success - confirm this is intended.
        1
    }, error = function(e) {
        unlink(name) # remove tar
        -1
    })

    if (result == -1) {
        stop(
            paste0(
                "There was an error in the download process (we might had a connection problem with GDC server).",
                "\nPlease run this function it again.",
                "\nTry using method = `client` or setting files.per.chunk to a small number."
            )
        )
    }

    message("Download completed")
    return(1)
}
|
178c3b9a |
|
94542aca |
# Format a number of bytes with decimal (power of 1000) units, e.g. "1.5 KB".
#
# bytes: a single number.
# Returns a character string: "<n> B" below 1000 bytes, otherwise the value
# scaled to the largest fitting unit (KB, MB, GB, TB, PB, EB).
humanReadableByteCount <- function(bytes) {
    unit <- 1000
    # paste0(bytes + " B") was an arithmetic error on a string
    if (bytes < unit) return(paste0(bytes, " B"))

    prefixes <- "KMGTPE"
    # log10 is exact for powers of ten, which keeps unit boundaries in the
    # right unit; the exponent is capped at the largest known prefix.
    exp <- min(floor(log10(bytes) / log10(unit)), nchar(prefixes))
    pre <- paste0(substr(prefixes, exp, exp), "B")
    nb <- bytes / (unit^exp)
    paste(nb, pre)
}
|
178c3b9a |
# Locate the gdc-client executable.
#
# The search path is consulted first (Sys.which); otherwise a file named
# "gdc-client" or "gdc-client.exe" in the working directory is used.
# Returns a single path, or "" when no client is found.
GDCclientPath <- function() {
    global <- Sys.which("gdc-client")
    if (global != "") return(global)

    # The previous pattern used glob syntax as a regular expression, so it
    # missed "gdc-client.exe". Directories are not executables.
    local <- dir(pattern = "^gdc-client(\\.exe)?$", full.names = TRUE)
    local <- local[!dir.exists(local)]
    if (length(local) > 0) return(local[1])

    return("")
}
# Check whether a gdc-client executable is available, either on the search
# path or as a file named "gdc-client" / "gdc-client.exe" in the working
# directory. Returns TRUE or FALSE.
GDCclientExists <- function() {
    if (Sys.which("gdc-client.exe") != "" || Sys.which("gdc-client") != "") {
        return(TRUE)
    }

    # The comparison used to sit inside length(); the pattern is now a real
    # regular expression and directories are ignored.
    local <- dir(pattern = "^gdc-client(\\.exe)?$", full.names = TRUE)
    return(any(!dir.exists(local)))
}
|
2d421843 |
#' @importFrom xml2 read_html
#' @importFrom downloader download
#' @importFrom rvest html_nodes html_attr %>%
# Return the path of the gdc-client executable, downloading and unpacking it
# into the working directory when it is not available yet.
#
# The download links are read from the GDC data transfer tool page. When the
# page cannot be read, or lists no public zip file, a fixed list of v1.4.0
# links is used instead. Returns the value of GDCclientPath().
GDCclientInstall <- function() {
    if (GDCclientExists()) return(GDCclientPath())

    fallback <- c(
        "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_Windows_x64.zip",
        "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_Ubuntu_x64.zip",
        "https://gdc.cancer.gov/system/files/authenticated%20user/0/gdc-client_v1.4.0_OSX_x64_10.12.6.zip"
    )

    links <- tryCatch({
        read_html("https://gdc.cancer.gov/access-data/gdc-data-transfer-tool") %>%
            html_nodes("a") %>%
            html_attr("href")
    }, error = function(e) {
        character(0)
    })

    # The fallback links do not contain "public", so they must not go through
    # this filter (they used to be discarded by it).
    bin <- links[grep("public.*zip", links)]
    if (length(bin) == 0) bin <- fallback

    if (is.windows()) bin <- bin[grep("client*.*windows", bin, ignore.case = TRUE)]
    if (is.mac()) bin <- bin[grep("client*.*OSX", bin)]
    if (is.linux()) {
        if (grepl("ubuntu", Sys.info()["version"], ignore.case = TRUE)) {
            bin <- bin[grep("client*.*Ubuntu", bin)]
        } else {
            bin <- bin[grep("client*.*Cent", bin)]
        }
    }

    if (length(bin) == 0) {
        stop(
            "Could not find a gdc-client download link for this operating system. ",
            "Please install the GDC Data Transfer Tool manually."
        )
    }
    # download() needs exactly one URL
    bin <- bin[1]

    # A zip archive is binary on every platform
    download(bin, basename(bin), mode = "wb")
    unzip(basename(bin))
    Sys.chmod("gdc-client")

    return(GDCclientPath())
}
|
cf2ebdfb |
|
480130c1 |
# Drop from a manifest the files that already exist below `path`.
#
# A file counts as present when it is found either as
# path/<id>/<filename> or directly as path/<filename>.
# Returns the rows of `manifest` that still have to be downloaded.
checkAlreadyDownloaded <- function(path, manifest) {
    in.id.folder <- file.exists(file.path(path, manifest$id, manifest$filename))
    in.path <- file.exists(file.path(path, manifest$filename))
    present <- in.id.folder | in.path
    files2Download <- !present

    if (any(present)) {
        message(
            "Of the ", nrow(manifest), " files for download ",
            sum(present), " already exist."
        )
        if (any(files2Download)) {
            message("We will download only those that are missing ones.")
        }
    }

    return(manifest[files2Download, ])
}
|