Browse code

add cleanup argument to loadStudy for untarred data

LiNk-NY authored on 03/12/2020 18:57:04
Showing 3 changed files

... ...
@@ -186,9 +186,9 @@ cbioportal2clinicaldf <- function(files) {
186 186
 #' @description **Note** that these functions should be used when a particular
187 187
 #' study is _not_ currently available as a `MultiAssayExperiment`
188 188
 #' representation. Otherwise, use `cBioDataPack`. Provide a `cancer_study_id`
189
-#' from the `studiesTable` and retrieve the study tarball from cBioPortal.
190
-#' These functions are used by `cBioDataPack` under the hood to download,
191
-#' untar, and load the tarball datasets with caching. As stated in
189
+#' from the `studiesTable` and retrieve the study tarball from the cBio
190
+#' Genomics Portal.  These functions are used by `cBioDataPack` under the hood
191
+#' to download, untar, and load the tarball datasets with caching. As stated in
192 192
 #' `?cBioDataPack`, not all studies are currently working as
193 193
 #' `MultiAssayExperiment` objects. As of July 2020, about ~80% of
194 194
 #' datasets can be successfully imported into the `MultiAssayExperiment` data
... ...
@@ -196,6 +196,13 @@ cbioportal2clinicaldf <- function(files) {
196 196
 #' study. You may also check `studiesTable$pack_build` for a more current
197 197
 #' status.
198 198
 #'
199
+#' @details When attempting to load a dataset using `loadStudy`, note that
200
+#' the `cleanup` argument is set to `TRUE` by default. Change the argument
201
+#' to `FALSE` if you would like to keep the untarred data in the `exdir`
202
+#' location. `downloadStudy` and `untarStudy` are not affected by this change.
203
+#' The tarball of the downloaded data is cached via `BiocFileCache` when
204
+#' `use_cache` is `TRUE`.
205
+#'
199 206
 #' @param cancer_study_id character(1) The study identifier from cBioPortal as
200 207
 #' in \url{https://cbioportal.org/webAPI}
201 208
 #'
... ...
@@ -203,16 +210,16 @@ cbioportal2clinicaldf <- function(files) {
203 210
 #' and use it to track downloaded data. If data is found in the cache, data will
204 211
 #' not be re-downloaded. A path can also be provided to data cache location.
205 212
 #'
206
-#' @param force logical(1) (default FALSE) whether to force re-download data from
207
-#' remote location
213
+#' @param force logical(1) (default FALSE) whether to force re-download data
214
+#' from remote location
208 215
 #'
209 216
 #' @param url_location character(1)
210 217
 #' (default "https://cbioportal-datahub.s3.amazonaws.com") the URL location for
211 218
 #' downloading packaged data. Can be set using the 'cBio_URL' option (see
212 219
 #' `?cBioDataPack` for more details)
213 220
 #'
214
-#' @param names.field A character vector of possible column names for the column
215
-#' that is used to label ranges from a mutations or copy number file.
221
+#' @param names.field A character vector of possible column names for the
222
+#' column that is used to label ranges from a mutations or copy number file.
216 223
 #'
217 224
 #' @param cancer_study_file character(1) indicates the on-disk location
218 225
 #' of the downloaded tarball
... ...
@@ -223,6 +230,9 @@ cbioportal2clinicaldf <- function(files) {
223 230
 #' @param filepath character(1) indicates the folder location where
224 231
 #' the contents of the tarball are *located* (usually the same as `exdir`)
225 232
 #'
233
+#' @param cleanup logical(1) whether to delete the `untar`-red contents from
234
+#' the `exdir` folder (default TRUE)
235
+#'
226 236
 #' @return \itemize{
227 237
 #'   \item {downloadStudy - The file location of the data tarball}
228 238
 #'   \item {untarStudy - The directory location of the contents}
... ...
@@ -294,11 +304,14 @@ untarStudy <- function(cancer_study_file, exdir = tempdir()) {
294 304
 #' @rdname downloadStudy
295 305
 #'
296 306
 #' @export
297
-loadStudy <-
298
-    function(
299
-        filepath, names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene")
300
-    )
301
-{
307
+loadStudy <- function(
308
+    filepath,
309
+    names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"),
310
+    cleanup = TRUE
311
+) {
312
+    if (cleanup)
313
+        on.exit(unlink(filepath, recursive = TRUE))
314
+
302 315
     datafiles <- getRelevantFilesFromStudy(
303 316
         list.files(filepath, recursive = TRUE)
304 317
     )
... ...
@@ -454,8 +467,9 @@ loadStudy <-
454 467
 #'
455 468
 #' @export
456 469
 cBioDataPack <- function(cancer_study_id, use_cache = TRUE,
457
-    names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"), ask = TRUE) {
458
-
470
+    names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"),
471
+    cleanup = TRUE, ask = TRUE)
472
+{
459 473
     denv <- new.env(parent = emptyenv())
460 474
     data("studiesTable", package = "cBioPortalData", envir = denv)
461 475
     studiesTable <- denv[["studiesTable"]]
... ...
@@ -481,6 +495,6 @@ cBioDataPack <- function(cancer_study_id, use_cache = TRUE,
481 495
 
482 496
     cancer_study_file <- downloadStudy(cancer_study_id, use_cache)
483 497
     exdir <- untarStudy(cancer_study_file)
484
-    loadStudy(exdir, names.field)
498
+    loadStudy(exdir, names.field, cleanup)
485 499
 }
486 500
 
... ...
@@ -9,6 +9,7 @@ cBioDataPack(
9 9
   cancer_study_id,
10 10
   use_cache = TRUE,
11 11
   names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"),
12
+  cleanup = TRUE,
12 13
   ask = TRUE
13 14
 )
14 15
 }
... ...
@@ -23,6 +24,9 @@ not be re-downloaded. A path can also be provided to data cache location.}
23 24
 \item{names.field}{A character vector of possible column names for the column
24 25
 that is used to label ranges from a mutations or copy number file.}
25 26
 
27
+\item{cleanup}{logical(1) whether to delete the \code{untar}-red contents from
28
+the \code{exdir} folder (default TRUE)}
29
+
26 30
 \item{ask}{A logical vector of length one indicating whether to prompt the
27 31
 user before downloading and loading study \code{MultiAssayExperiment}. If
28 32
 TRUE, the user will be prompted to continue for studies that are not
... ...
@@ -15,7 +15,11 @@ downloadStudy(
15 15
 
16 16
 untarStudy(cancer_study_file, exdir = tempdir())
17 17
 
18
-loadStudy(filepath, names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"))
18
+loadStudy(
19
+  filepath,
20
+  names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"),
21
+  cleanup = TRUE
22
+)
19 23
 }
20 24
 \arguments{
21 25
 \item{cancer_study_id}{character(1) The study identifier from cBioPortal as
... ...
@@ -25,8 +29,8 @@ in \url{https://cbioportal.org/webAPI}}
25 29
 and use it to track downloaded data. If data is found in the cache, data will
26 30
 not be re-downloaded. A path can also be provided to data cache location.}
27 31
 
28
-\item{force}{logical(1) (default FALSE) whether to force re-download data from
29
-remote location}
32
+\item{force}{logical(1) (default FALSE) whether to force re-download data
33
+from remote location}
30 34
 
31 35
 \item{url_location}{character(1)
32 36
 (default "https://cbioportal-datahub.s3.amazonaws.com") the URL location for
... ...
@@ -42,8 +46,11 @@ the contents of the tarball (default \code{tempdir()}; see also \code{?untar})}
42 46
 \item{filepath}{character(1) indicates the folder location where
43 47
 the contents of the tarball are \emph{located} (usually the same as \code{exdir})}
44 48
 
45
-\item{names.field}{A character vector of possible column names for the column
46
-that is used to label ranges from a mutations or copy number file.}
49
+\item{names.field}{A character vector of possible column names for the
50
+column that is used to label ranges from a mutations or copy number file.}
51
+
52
+\item{cleanup}{logical(1) whether to delete the \code{untar}-red contents from
53
+the \code{exdir} folder (default TRUE)}
47 54
 }
48 55
 \value{
49 56
 \itemize{
... ...
@@ -56,9 +63,9 @@ that is used to label ranges from a mutations or copy number file.}
56 63
 \strong{Note} that these functions should be used when a particular
57 64
 study is \emph{not} currently available as a \code{MultiAssayExperiment}
58 65
 representation. Otherwise, use \code{cBioDataPack}. Provide a \code{cancer_study_id}
59
-from the \code{studiesTable} and retrieve the study tarball from cBioPortal.
60
-These functions are used by \code{cBioDataPack} under the hood to download,
61
-untar, and load the tarball datasets with caching. As stated in
66
+from the \code{studiesTable} and retrieve the study tarball from the cBio
67
+Genomics Portal.  These functions are used by \code{cBioDataPack} under the hood
68
+to download, untar, and load the tarball datasets with caching. As stated in
62 69
 \code{?cBioDataPack}, not all studies are currently working as
63 70
 \code{MultiAssayExperiment} objects. As of July 2020, about ~80\% of
64 71
 datasets can be successfully imported into the \code{MultiAssayExperiment} data
... ...
@@ -66,6 +73,14 @@ class. Please open an issue if you would like the team to prioritize a
66 73
 study. You may also check \code{studiesTable$pack_build} for a more current
67 74
 status.
68 75
 }
76
+\details{
77
+When attempting to load a dataset using \code{loadStudy}, note that
78
+the \code{cleanup} argument is set to \code{TRUE} by default. Change the argument
79
+to \code{FALSE} if you would like to keep the untarred data in the \code{exdir}
80
+location. \code{downloadStudy} and \code{untarStudy} are not affected by this change.
81
+The tarball of the downloaded data is cached via \code{BiocFileCache} when
82
+\code{use_cache} is \code{TRUE}.
83
+}
69 84
 \examples{
70 85
 
71 86
 (acc_file <- downloadStudy("acc_tcga"))