Browse code

Starting to remove the legacy archive from TCGAbiolinks, since it will be shut down by GDC

Tiago Silva authored on 06/05/2023 15:00:13
Showing 16 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: TCGAbiolinks
2 2
 Type: Package
3 3
 Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data
4
-Version: 2.29.0
4
+Version: 2.29.1
5 5
 Date: 2022-17-08
6 6
 Author: Antonio Colaprico,
7 7
     Tiago Chedraoui Silva,
... ...
@@ -1,3 +1,8 @@
1
+CHANGES IN VERSION 2.29.1
2
+-------------------------
3
+
4
+* Removing support for the legacy archive since it will be shut down by GDC soon.
5
+
1 6
 CHANGES IN VERSION 2.21.1
2 7
 -------------------------
3 8
 
... ...
@@ -344,13 +344,16 @@ GDCquery_clinic <- function(
344 344
                             } else {
345 345
                                 # HTMCP-03-06-02061 has two diagnosis
346 346
                                 x$submitter_id <- gsub("_diagnosis.*","",x$submitter_id)
347
+                                # If there are two rows for the same submitter_id
348
+                                # we will collapse them into one single row
349
+                                # concatenating all columns using ;
347 350
                                 aux <- x %>% dplyr::group_by(submitter_id) %>%
348
-                                    dplyr::summarise_each(funs(paste(unique(.), collapse = ";")))
351
+                                    summarise(across(everything(),~ paste(unique(.), collapse = ";")))
349 352
                                 aux$treatments <- list(dplyr::bind_rows(x$treatments))
350 353
                                 aux
351 354
                             }
352 355
                         }
353
-                    ),fill = T
356
+                    ), fill = TRUE
354 357
                 )
355 358
                 #df$submitter_id <- gsub("^d|_diagnosis|diag-|-DX|-DIAG|-diagnosis","", df$submitter_id)
356 359
                 # ^d ORGANOID-PANCREATIC
... ...
@@ -500,7 +503,7 @@ GDCprepare_clinic <- function(
500 503
     }
501 504
 
502 505
     # Get all the clinical XML files
503
-    source <- ifelse(query$legacy,"legacy","harmonized")
506
+    source <- "harmonized"
504 507
     files <- file.path(
505 508
         query$results[[1]]$project, source,
506 509
         gsub(" ","_",query$results[[1]]$data_category),
... ...
@@ -16,15 +16,6 @@
16 16
 #' @importFrom methods is
17 17
 #' @export
18 18
 #' @examples
19
-#' query <- GDCquery(
20
-#'   project = "TCGA-ACC",
21
-#'   data.category =  "Copy number variation",
22
-#'   legacy = TRUE,
23
-#'   file.type = "hg19.seg",
24
-#'   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
25
-#'  )
26
-#' # data will be saved in  GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
27
-#' GDCdownload(query, method = "api")
28 19
 #' \dontrun{
29 20
 #'     # Download clinical data from XML
30 21
 #'     query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
... ...
@@ -39,14 +30,14 @@
39 30
 #'     # data will be saved in:
40 31
 #'     # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
41 32
 #'     GDCdownload(query, method = "client", directory = "example_data_dir")
42
-#'     acc.gbm <- GDCquery(
33
+#'     query_acc_gbm <- GDCquery(
43 34
 #'         project =  c("TCGA-ACC","TCGA-GBM"),
44 35
 #'         data.category = "Transcriptome Profiling",
45 36
 #'         data.type = "Gene Expression Quantification",
46 37
 #'         workflow.type = "STAR - Counts"
47 38
 #'     )
48 39
 #'     GDCdownload(
49
-#'        query = acc.gbm,
40
+#'        query = query_acc_gbm,
50 41
 #'        method = "api",
51 42
 #'        directory = "example",
52 43
 #'        files.per.chunk = 50
... ...
@@ -73,7 +64,7 @@ GDCdownload <- function(
73 64
         stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.")
74 65
     }
75 66
 
76
-    source <- ifelse(query$legacy,"legacy","harmonized")
67
+    source <- "harmonized"
77 68
 
78 69
     dir.create(directory, showWarnings = FALSE, recursive = TRUE)
79 70
     for(proj in unique(unlist(query$project))){
... ...
@@ -152,11 +143,7 @@ GDCdownload <- function(
152 143
                 )
153 144
             }
154 145
 
155
-            server <- ifelse(
156
-                query$legacy,
157
-                "https://api.gdc.cancer.gov/legacy/data/",
158
-                "https://api.gdc.cancer.gov/data/"
159
-            )
146
+            server <- "https://api.gdc.cancer.gov/data/"
160 147
 
161 148
             if (is.null(files.per.chunk) & sum(as.numeric(manifest$size)) > 10^9) {
162 149
                 message("The total size of files is big. We will download files in chunks")
... ...
@@ -67,107 +67,52 @@ checkProjectInput <- function(project){
67 67
     }
68 68
 }
69 69
 
70
-checkLegacyPlatform <- function(project,data.category, legacy = FALSE){
71
-    project.summary <- getProjectSummary(project, legacy)
72
-    if(missing(data.category)) {
73
-        print(knitr::kable(project.summary$data_categories))
74
-        stop("Please set a data.category argument from the column data_category above")
75
-    }
76
-    if(!(data.category %in% project.summary$data_categories$data_category)) {
77
-        print(knitr::kable(project.summary$data_categories))
78
-        stop("Please set a valid data.category argument from the column data_category above")
79
-    }
80
-}
70
+checkDataTypeInput <- function(data.type){
71
+
72
+    harmonized.data.type <- c(
73
+        "Aggregated Somatic Mutation",
74
+        "Aligned Reads",
75
+        "Gene Expression Quantification",
76
+        "Raw CGI Variant",
77
+        "Methylation Beta Value",
78
+        "Differential Gene Expression",
79
+        "Splice Junction Quantification",
80
+        "Protein Expression Quantification",
81
+        "Annotated Somatic Mutation",
82
+        "Raw Simple Somatic Mutation",
83
+        "Masked Somatic Mutation",
84
+        "Copy Number Segment",
85
+        "Masked Intensities",
86
+        "Allele-specific Copy Number Segment",
87
+        "Masked Copy Number Segment",
88
+        "Isoform Expression Quantification",
89
+        "miRNA Expression Quantification",
90
+        "Gene Level Copy Number",
91
+        "Biospecimen Supplement",
92
+        "Gene Level Copy Number Scores",
93
+        "Protein Expression Quantification",
94
+        "Clinical Supplement",
95
+        "Single Cell Analysis",
96
+        "Masked Somatic Mutation",
97
+        "Slide Image"
98
+    )
81 99
 
82
-checkDataTypeInput <- function(legacy, data.type){
83
-    if(legacy){
84
-        legacy.data.type <- c("Copy number segmentation",
85
-                              "Raw intensities",
86
-                              "Aligned reads",
87
-                              "Copy number estimate",
88
-                              "Simple nucleotide variation",
89
-                              "Gene expression quantification",
90
-                              "Coverage WIG",
91
-                              "miRNA gene quantification",
92
-                              "Genotypes",
93
-                              "miRNA isoform quantification",
94
-                              "Normalized copy numbers",
95
-                              "Isoform expression quantification",
96
-                              "Normalized intensities",
97
-                              "Tissue slide image",
98
-                              "Exon quantification",
99
-                              "Exon junction quantification",
100
-                              "Methylation beta value",
101
-                              "Unaligned reads",
102
-                              "Diagnostic image",
103
-                              "CGH array QC",
104
-                              "Biospecimen Supplement",
105
-                              "Pathology report",
106
-                              "Clinical Supplement",
107
-                              "Intensities",
108
-                              "Protein expression quantification",
109
-                              "Microsatellite instability",
110
-                              "Structural variation",
111
-                              "Auxiliary test",
112
-                              "Copy number QC metrics",
113
-                              "Intensities Log2Ratio",
114
-                              "Methylation array QC metrics",
115
-                              "Clinical data",
116
-                              "Copy number variation",
117
-                              "ABI sequence trace",
118
-                              "Protein Expression Quantification",
119
-                              "Biospecimen data",
120
-                              "Simple somatic mutation",
121
-                              "Bisulfite sequence alignment",
122
-                              "Methylation percentage",
123
-                              "Sequencing tag",
124
-                              "Sequencing tag counts",
125
-                              "LOH")
126
-        if(!data.type %in% legacy.data.type) {
127
-            print(knitr::kable(as.data.frame(sort(legacy.data.type))))
128
-            stop("Please set a data.type argument from the column legacy.data.type above")
129
-        }
130
-    } else {
131
-        harmonized.data.type <- c(
132
-            "Aggregated Somatic Mutation",
133
-            "Aligned Reads",
134
-            "Gene Expression Quantification",
135
-            "Raw CGI Variant",
136
-            "Methylation Beta Value",
137
-            "Differential Gene Expression",
138
-            "Splice Junction Quantification",
139
-            "Protein Expression Quantification",
140
-            "Annotated Somatic Mutation",
141
-            "Raw Simple Somatic Mutation",
142
-            "Masked Somatic Mutation",
143
-            "Copy Number Segment",
144
-            "Masked Intensities",
145
-            "Allele-specific Copy Number Segment",
146
-            "Masked Copy Number Segment",
147
-            "Isoform Expression Quantification",
148
-            "miRNA Expression Quantification",
149
-            "Gene Level Copy Number",
150
-            "Biospecimen Supplement",
151
-            "Gene Level Copy Number Scores",
152
-            "Protein Expression Quantification",
153
-            "Clinical Supplement",
154
-            "Single Cell Analysis",
155
-            "Masked Somatic Mutation",
156
-            "Slide Image")
157
-        if(!data.type %in% harmonized.data.type) {
158
-            print(knitr::kable(as.data.frame(sort(harmonized.data.type))))
159
-            stop("Please set a data.type argument from the column harmonized.data.type above")
160
-        }
100
+    if (!data.type %in% harmonized.data.type) {
101
+        print(knitr::kable(as.data.frame(sort(harmonized.data.type))))
102
+        stop("Please set a data.type argument from the column harmonized.data.type above")
161 103
     }
162 104
 }
163 105
 
164
-checkDataCategoriesInput <- function(project,data.category, legacy = FALSE){
106
+checkDataCategoriesInput <- function(project,data.category){
107
+
165 108
     for(proj in project){
166
-        project.summary <- getProjectSummary(proj, legacy)
109
+
110
+        project.summary <- getProjectSummary(proj)
167 111
         if(missing(data.category)) {
168 112
             print(knitr::kable(project.summary$data_categories))
169 113
             stop("Please set a data.category argument from the column data_category above")
170 114
         }
115
+
171 116
         if(!(data.category %in% project.summary$data_categories$data_category)) {
172 117
             print(knitr::kable(project.summary$data_categories))
173 118
             stop("Please set a valid data.category argument from the column data_category above. We could not validade the data.category for project ", proj)
... ...
@@ -618,13 +563,10 @@ get.mutation <- function(
618 563
     if(missing(genes)) stop("Argument genes is missing")
619 564
 
620 565
     # Get mutation annotation file
621
-    library(maftools)
622
-    library(dplyr)
623 566
     query <- GDCquery(
624 567
         project = project,
625 568
         data.category = "Simple Nucleotide Variation",
626 569
         access = "open",
627
-        legacy = FALSE,
628 570
         data.type = "Masked Somatic Mutation",
629 571
         workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
630 572
     )
... ...
@@ -638,8 +580,9 @@ get.mutation <- function(
638 580
         unlist(
639 581
             sapply(
640 582
                 mutant_variant_classification,
641
-                function(x) grep(x,maf$Variant_Classification,
642
-                                 ignore.case = TRUE)
583
+                function(x) {
584
+                    grep(x,maf$Variant_Classification,ignore.case = TRUE)
585
+                }
643 586
             )
644 587
         )
645 588
     )
... ...
@@ -648,8 +591,10 @@ get.mutation <- function(
648 591
     mut <- NULL
649 592
     for(i in genes) {
650 593
         if(!i %in% maf$Hugo_Symbol) next
651
-        aux <- data.frame(patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15),
652
-                          mut = TRUE)
594
+        aux <- data.frame(
595
+            patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15),
596
+            mut = TRUE
597
+        )
653 598
         colnames(aux)[2] <- paste0("mut_hg38_",i)
654 599
         if(is.null(mut)) {
655 600
             mut <- aux
... ...
@@ -668,6 +613,7 @@ get.mutation <- function(
668 613
 
669 614
     return(mut)
670 615
 }
616
+
671 617
 get.mut.gistc <- function(
672 618
         project,
673 619
         genes,
... ...
@@ -694,6 +640,7 @@ get.mut.gistc <- function(
694 640
     } else if(is.null(mut) & !is.null(cnv)) {
695 641
         return(cnv)
696 642
     }
643
+
697 644
     return(NULL)
698 645
 }
699 646
 get.mut.gistc.information <- function(
... ...
@@ -91,7 +91,7 @@ GDCprepare <- function(
91 91
         stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost")
92 92
     }
93 93
     # We save the files in project/source/data.category/data.type/file_id/file_name
94
-    source <- ifelse(query$legacy,"legacy","harmonized")
94
+    source <- "harmonized"
95 95
     files <- file.path(
96 96
         query$results[[1]]$project, source,
97 97
         gsub(" ","_",query$results[[1]]$data_category),
... ...
@@ -174,8 +174,7 @@ GDCprepare <- function(
174 174
             files = files,
175 175
             cases = cases,
176 176
             summarizedExperiment = summarizedExperiment,
177
-            platform =  unique(query$results[[1]]$platform),
178
-            legacy = query$legacy
177
+            platform =  unique(query$results[[1]]$platform)
179 178
         )
180 179
     }  else if (grepl("Raw intensities|Masked Intensities",query$data.type, ignore.case = TRUE)) {
181 180
         # preparing IDAT files
... ...
@@ -183,8 +182,7 @@ GDCprepare <- function(
183 182
             files = files,
184 183
             barcode = cases,
185 184
             summarizedExperiment = summarizedExperiment,
186
-            platform =  unique(query$results[[1]]$platform),
187
-            legacy = query$legacy
185
+            platform =  unique(query$results[[1]]$platform)
188 186
         )
189 187
     }  else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
190 188
 
... ...
@@ -199,7 +197,7 @@ GDCprepare <- function(
199 197
 
200 198
     }  else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) {
201 199
 
202
-        if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE) | source == "legacy"){
200
+        if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
203 201
             data <- readSimpleNucleotideVariationMaf(files)
204 202
         }
205 203
 
... ...
@@ -212,7 +210,7 @@ GDCprepare <- function(
212 210
                 files = files,
213 211
                 cases = cases,
214 212
                 summarizedExperiment = summarizedExperiment,
215
-                genome = ifelse(query$legacy,"hg19","hg38"),
213
+                genome = "hg38",
216 214
                 experimental.strategy = unique(query$results[[1]]$experimental_strategy)
217 215
             )
218 216
 
... ...
@@ -221,7 +219,7 @@ GDCprepare <- function(
221 219
                 files = files,
222 220
                 cases = cases,
223 221
                 summarizedExperiment = FALSE,
224
-                genome = ifelse(query$legacy,"hg19","hg38"),
222
+                genome = "hg38",
225 223
                 experimental.strategy = unique(query$results[[1]]$experimental_strategy)
226 224
             )
227 225
 
... ...
@@ -713,14 +711,13 @@ readIDATDNAmethylation <- function(
713 711
         files,
714 712
         barcode,
715 713
         summarizedExperiment,
716
-        platform,
717
-        legacy
714
+        platform
718 715
 ) {
719 716
 
720 717
     check_package("sesame")
721 718
 
722 719
     # Check if moved files would be moved outside of scope folder, if so, path doesn't change
723
-    moved.files <- sapply(files,USE.NAMES=FALSE,function(x){
720
+    moved.files <- sapply(files,USE.NAMES = FALSE,function(x){
724 721
         if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) {
725 722
             return(file.path(dirname(dirname(x)), basename(x)))
726 723
         }
... ...
@@ -753,7 +750,7 @@ readIDATDNAmethylation <- function(
753 750
 
754 751
         betas <- makeSEFromDNAMethylationMatrix(
755 752
             betas = betas,
756
-            genome = ifelse(legacy,"hg19","hg38"),
753
+            genome ="hg38",
757 754
             met.platform = platform
758 755
         )
759 756
         colData(betas) <- DataFrame(colDataPrepare(colnames(betas)))
... ...
@@ -774,8 +771,7 @@ readDNAmethylation <- function(
774 771
         files,
775 772
         cases,
776 773
         summarizedExperiment = TRUE,
777
-        platform,
778
-        legacy
774
+        platform
779 775
 ){
780 776
     if(length(platform) > 1){
781 777
 
... ...
@@ -847,7 +843,7 @@ readDNAmethylation <- function(
847 843
 
848 844
             df <- makeSEFromDNAMethylationMatrix(
849 845
                 betas = df,
850
-                genome = ifelse(legacy,"hg19","hg38"),
846
+                genome = "hg38",
851 847
                 met.platform = platform
852 848
             )
853 849
         }
... ...
@@ -1056,31 +1052,37 @@ colDataPrepareTCGA <- function(barcode){
1056 1052
     # For the moment this will work only for TCGA Data
1057 1053
     # We should search what TARGET data means
1058 1054
 
1059
-    code <- c('01','02','03','04','05','06','07','08','09','10','11',
1060
-              '12','13','14','20','40','50','60','61')
1061
-    shortLetterCode <- c("TP","TR","TB","TRBM","TAP","TM","TAM","THOC",
1062
-                         "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB",
1063
-                         "CELL","XP","XCL")
1064
-
1065
-    definition <- c("Primary solid Tumor", # 01
1066
-                    "Recurrent Solid Tumor", # 02
1067
-                    "Primary Blood Derived Cancer - Peripheral Blood", # 03
1068
-                    "Recurrent Blood Derived Cancer - Bone Marrow", # 04
1069
-                    "Additional - New Primary", # 05
1070
-                    "Metastatic", # 06
1071
-                    "Additional Metastatic", # 07
1072
-                    "Human Tumor Original Cells", # 08
1073
-                    "Primary Blood Derived Cancer - Bone Marrow", # 09
1074
-                    "Blood Derived Normal", # 10
1075
-                    "Solid Tissue Normal",  # 11
1076
-                    "Buccal Cell Normal",   # 12
1077
-                    "EBV Immortalized Normal", # 13
1078
-                    "Bone Marrow Normal", # 14
1079
-                    "Control Analyte", # 20
1080
-                    "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
1081
-                    "Cell Lines", # 50
1082
-                    "Primary Xenograft Tissue", # 60
1083
-                    "Cell Line Derived Xenograft Tissue") # 61
1055
+    code <- c(
1056
+        '01','02','03','04','05','06','07','08','09','10','11',
1057
+        '12','13','14','20','40','50','60','61'
1058
+    )
1059
+    shortLetterCode <- c(
1060
+        "TP","TR","TB","TRBM","TAP","TM","TAM","THOC",
1061
+        "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB",
1062
+        "CELL","XP","XCL"
1063
+    )
1064
+
1065
+    definition <- c(
1066
+        "Primary solid Tumor", # 01
1067
+        "Recurrent Solid Tumor", # 02
1068
+        "Primary Blood Derived Cancer - Peripheral Blood", # 03
1069
+        "Recurrent Blood Derived Cancer - Bone Marrow", # 04
1070
+        "Additional - New Primary", # 05
1071
+        "Metastatic", # 06
1072
+        "Additional Metastatic", # 07
1073
+        "Human Tumor Original Cells", # 08
1074
+        "Primary Blood Derived Cancer - Bone Marrow", # 09
1075
+        "Blood Derived Normal", # 10
1076
+        "Solid Tissue Normal",  # 11
1077
+        "Buccal Cell Normal",   # 12
1078
+        "EBV Immortalized Normal", # 13
1079
+        "Bone Marrow Normal", # 14
1080
+        "Control Analyte", # 20
1081
+        "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
1082
+        "Cell Lines", # 50
1083
+        "Primary Xenograft Tissue", # 60
1084
+        "Cell Line Derived Xenograft Tissue"
1085
+    ) # 61
1084 1086
     aux <- DataFrame(code = code,shortLetterCode,definition)
1085 1087
 
1086 1088
     # in case multiple equal barcode
... ...
@@ -1088,10 +1090,12 @@ colDataPrepareTCGA <- function(barcode){
1088 1090
                     "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}")
1089 1091
     samples <- str_match(barcode,regex)[,1]
1090 1092
 
1091
-    ret <- DataFrame(barcode = barcode,
1092
-                     patient = substr(barcode, 1, 12),
1093
-                     sample = substr(barcode, 1, 16),
1094
-                     code = substr(barcode, 14, 15))
1093
+    ret <- DataFrame(
1094
+        barcode = barcode,
1095
+        patient = substr(barcode, 1, 12),
1096
+        sample = substr(barcode, 1, 16),
1097
+        code = substr(barcode, 14, 15)
1098
+    )
1095 1099
     ret <- merge(ret,aux, by = "code", sort = FALSE)
1096 1100
     ret <- ret[match(barcode,ret$barcode),]
1097 1101
     rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE))
... ...
@@ -3,7 +3,6 @@
3 3
 #'   Uses the GDC API to search for data. It searches for both controlled and
4 4
 #'   open-access data.
5 5
 #'   For GDC data arguments project, data.category, data.type and workflow.type should be used
6
-#'   For the legacy data arguments project, data.category, platform and/or file.extension should be used.
7 6
 #'   Please, see the vignette for a table with the possibilities.
8 7
 #' @param project A list of valid projects (see list with TCGAbiolinks:::getGDCprojects()$project_id)
9 8
 #' \itemize{
... ...
@@ -75,33 +74,15 @@
75 74
 #' \item{ Simple Nucleotide Variation }
76 75
 #' \item{ Transcriptome Profiling }
77 76
 #' }
78
-#' List for legacy archive
79
-#' \itemize{
80
-#' \item{ Biospecimen }
81
-#' \item{ Clinical }
82
-#' \item{ Copy number variation }
83
-#' \item{ DNA methylation }
84
-#' \item{ Gene expression }
85
-#' \item{ Protein expression }
86
-#' \item{ Raw microarray data }
87
-#' \item{ Raw sequencing data }
88
-#' \item{ Simple nucleotide variation }
89
-#' }
90 77
 #' @param data.type A data type to filter the files to download
91 78
 #' For the complete list please check the vignette.
92 79
 #' @param sample.type A sample type to filter the files to download
93 80
 #' @param barcode A list of barcodes to filter the files to download
94
-#' @param legacy Search in the legacy repository
95 81
 #' @param data.format Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML",
96 82
 #' "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")
97
-#' @param file.type To be used in the legacy database for some platforms,
98
-#' to define which file types to be used.
99 83
 #' @param workflow.type GDC workflow type
100
-#' @param experimental.strategy Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
101
-#' Legacy:  WXS, RNA-Seq, miRNA-Seq, Genotyping Array,
102
-#' DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS,
103
-#' MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array,
104
-#' Total RNA-Seq, Capillary sequencing, Bisulfite-Seq
84
+#' @param experimental.strategy Filter to experimental strategy.
85
+#' Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
105 86
 #' @param access Filter by access type. Possible values: controlled, open
106 87
 #' @param platform Example:
107 88
 #' \tabular{ll}{
... ...
@@ -157,19 +138,6 @@
157 138
 #'    data.type = "Masked Copy Number Segment",
158 139
 #'    sample.type = c("Primary Tumor")
159 140
 #' )
160
-#' query.met <- GDCquery(
161
-#'    project = c("TCGA-GBM","TCGA-LGG"),
162
-#'    legacy = TRUE,
163
-#'    data.category = "DNA methylation",
164
-#'    platform = "Illumina Human Methylation 450"
165
-#' )
166
-#' query <- GDCquery(
167
-#'    project = "TCGA-ACC",
168
-#'    data.category =  "Copy number variation",
169
-#'    legacy = TRUE,
170
-#'    file.type = "hg19.seg",
171
-#'    barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")
172
-#' )
173 141
 #' }
174 142
 #' @return A data frame with the results and the parameters used
175 143
 #' @importFrom jsonlite fromJSON
... ...
@@ -183,7 +151,6 @@ GDCquery <- function(
183 151
         data.category,
184 152
         data.type,
185 153
         workflow.type,
186
-        legacy = FALSE,
187 154
         access,
188 155
         platform,
189 156
         file.type,
... ...
@@ -243,11 +210,11 @@ GDCquery <- function(
243 210
         }
244 211
     })
245 212
     print.header("GDCquery: Searching in GDC database","section")
246
-    message("Genome of reference: ",ifelse(legacy,"hg19","hg38"))
213
+    message("Genome of reference: hg38")
247 214
     # Check arguments
248 215
     checkProjectInput(project)
249
-    checkDataCategoriesInput(project, data.category, legacy)
250
-    if(!is.na(data.type)) checkDataTypeInput(legacy = legacy, data.type = data.type)
216
+    checkDataCategoriesInput(project, data.category)
217
+    if(!is.na(data.type)) checkDataTypeInput(data.type = data.type)
251 218
     if(!any(is.na(sample.type))) checkBarcodeDefinition(sample.type)
252 219
 
253 220
     results <- NULL
... ...
@@ -257,7 +224,6 @@ GDCquery <- function(
257 224
             project = proj,
258 225
             data.category = data.category,
259 226
             data.type = data.type,
260
-            legacy = legacy,
261 227
             workflow.type = workflow.type,
262 228
             platform = platform,
263 229
             file.type = file.type,
... ...
@@ -279,7 +245,6 @@ GDCquery <- function(
279 245
                 project = proj,
280 246
                 data.category = data.category,
281 247
                 data.type = data.type,
282
-                legacy = legacy,
283 248
                 workflow.type = NA,
284 249
                 platform = NA,
285 250
                 file.type = file.type,
... ...
@@ -621,17 +586,6 @@ GDCquery <- function(
621 586
         message("ooo By sample.type")
622 587
         results <- results[tolower(results$sample_type) %in% tolower(sample.type),]
623 588
     }
624
-    # some how there are duplicated files in GDC we should remove them
625
-    # Example of problematic query
626
-    # query.exp <- GDCquery(project = "TCGA-BRCA",
627
-    #                  legacy = TRUE,
628
-    #                  data.category = "Gene expression",
629
-    #                  data.type = "Gene expression quantification",
630
-    #                  platform = "Illumina HiSeq",
631
-    #                  file.type = "results",
632
-    #                  experimental_strategy = "RNA-Seq",
633
-    #                  sample.type = c("Primary solid Tumor","Solid Tissue Normal"))
634
-    #
635 589
     print.header("Checking data","subsection")
636 590
 
637 591
     message("ooo Checking if there are duplicated cases")
... ...
@@ -665,7 +619,6 @@ GDCquery <- function(
665 619
         project = I(list(project)),
666 620
         data.category = data.category,
667 621
         data.type = data.type,
668
-        legacy = legacy,
669 622
         access = I(list(access)),
670 623
         experimental.strategy =  I(list(experimental.strategy)),
671 624
         file.type = file.type,
... ...
@@ -677,37 +630,41 @@ GDCquery <- function(
677 630
     return(ret)
678 631
 }
679 632
 
680
-getGDCquery <- function(project, data.category, data.type, legacy, workflow.type,platform,file.type,files.access,sample.type,experimental.strategy){
633
+getGDCquery <- function(
634
+        project,
635
+        data.category,
636
+        data.type,
637
+        workflow.type,
638
+        platform,
639
+        file.type,
640
+        files.access,
641
+        sample.type,
642
+        experimental.strategy
643
+){
681 644
     # Get manifest using the API
682
-    baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/files/?","https://api.gdc.cancer.gov/files/?")
645
+    baseURL <- "https://api.gdc.cancer.gov/files/?"
683 646
     options.pretty <- "pretty=true"
684
-    if(data.category == "Protein expression" & legacy) {
685
-        options.expand <- "fields=archive.revision,archive.file_name,md5sum,state,data_category,file_id,platform,file_name,file_size,md5sum,submitter_id,data_type&expand=cases.samples.portions,cases.project,center,analysis"
686
-    } else if(data.category %in% c("Clinical","Biospecimen")) {
647
+    if(data.category %in% c("Clinical","Biospecimen")) {
687 648
         options.expand <- "expand=cases,cases.project,center,analysis"
688 649
     } else {
689 650
         options.expand <- "expand=cases,cases.samples.portions.analytes.aliquots,cases.project,center,analysis,cases.samples"
690 651
     }
691
-    option.size <- paste0("size=",getNbFiles(project,data.category,legacy))
652
+    option.size <- paste0("size=",getNbFiles(project,data.category))
692 653
     option.format <- paste0("format=JSON")
693 654
 
694
-    options.filter <- paste0("filters=",
695
-                             URLencode('{"op":"and","content":['),  # Start json request
696
-                             URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'),
697
-                             project,
698
-                             URLencode('"]}}'))
655
+    options.filter <- paste0(
656
+        "filters=",
657
+        URLencode('{"op":"and","content":['),  # Start json request
658
+        URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'),
659
+        project,
660
+        URLencode('"]}}')
661
+    )
699 662
 
700
-    if(!is.na(experimental.strategy))  options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy))
663
+    if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy))
701 664
     if(!is.na(data.category))  options.filter <- paste0(options.filter,addFilter("files.data_category", data.category))
702 665
     if(!is.na(data.type))  options.filter <- paste0(options.filter,addFilter("files.data_type", data.type))
703 666
     if(!is.na(workflow.type))  options.filter <- paste0(options.filter,addFilter("files.analysis.workflow_type", workflow.type))
704 667
     if(!any(is.na(platform))) options.filter <- paste0(options.filter,addFilter("files.platform", platform))
705
-    if(!any(is.na(file.type))) {
706
-        if(file.type == "results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "unnormalized"))
707
-        if(file.type == "normalized_results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "normalized"))
708
-        if(file.type == "nocnv_hg19.seg" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "nocnv"))
709
-        if(file.type == "hg19.isoform" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "hg19"))
710
-    }
711 668
     if(!any(is.na(files.access))) {
712 669
         options.filter <- paste0(options.filter,addFilter("files.access", files.access))
713 670
     }
... ...
@@ -1028,12 +985,11 @@ GDCquery_ATAC_seq <- function(
1028 985
     results$data_category <- "ATAC-seq"
1029 986
     results$project <- "ATAC-seq"
1030 987
     ret <- data.frame(
1031
-        results=I(list(results)),
988
+        results = I(list(results)),
1032 989
         tumor = I(list(tumor)),
1033 990
         project = I(list("ATAC-seq")),
1034 991
         data.type = I(list("ATAC-seq")),
1035
-        data.category = I(list("ATAC-seq")),
1036
-        legacy = I(list(FALSE))
992
+        data.category = I(list("ATAC-seq"))
1037 993
     )
1038 994
 
1039 995
     return(ret)
... ...
@@ -871,7 +871,6 @@ unlistlabels <- function(lab) {
871 871
 #' @importFrom data.table dcast setDT setDF :=
872 872
 #' @examples
873 873
 #' \dontrun{
874
-#' library(maftools)
875 874
 #' library(dplyr)
876 875
 #' query <- GDCquery(
877 876
 #'    project = "TCGA-CHOL",
... ...
@@ -929,7 +928,6 @@ TCGAvisualize_oncoprint <- function(
929 928
         annotation.legend.side = "bottom"
930 929
 ){
931 930
 
932
-
933 931
     check_package("ComplexHeatmap")
934 932
     check_package("circlize")
935 933
     check_package("grid")
... ...
@@ -34,15 +34,6 @@ Uses GDC API or GDC transfer tool to download gdc data
34 34
   The data from the query will be saved in a folder: project/data.category
35 35
 }
36 36
 \examples{
37
-query <- GDCquery(
38
-  project = "TCGA-ACC",
39
-  data.category =  "Copy number variation",
40
-  legacy = TRUE,
41
-  file.type = "hg19.seg",
42
-  barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
43
- )
44
-# data will be saved in  GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
45
-GDCdownload(query, method = "api")
46 37
 \dontrun{
47 38
     # Download clinical data from XML
48 39
     query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
... ...
@@ -57,14 +48,14 @@ GDCdownload(query, method = "api")
57 48
     # data will be saved in:
58 49
     # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
59 50
     GDCdownload(query, method = "client", directory = "example_data_dir")
60
-    acc.gbm <- GDCquery(
51
+    query_acc_gbm <- GDCquery(
61 52
         project =  c("TCGA-ACC","TCGA-GBM"),
62 53
         data.category = "Transcriptome Profiling",
63 54
         data.type = "Gene Expression Quantification",
64 55
         workflow.type = "STAR - Counts"
65 56
     )
66 57
     GDCdownload(
67
-       query = acc.gbm,
58
+       query = query_acc_gbm,
68 59
        method = "api",
69 60
        directory = "example",
70 61
        files.per.chunk = 50
... ...
@@ -9,7 +9,6 @@ GDCquery(
9 9
   data.category,
10 10
   data.type,
11 11
   workflow.type,
12
-  legacy = FALSE,
13 12
   access,
14 13
   platform,
15 14
   file.type,
... ...
@@ -90,18 +89,6 @@ List for harmonized database:
90 89
 \item{ Sequencing Reads }
91 90
 \item{ Simple Nucleotide Variation }
92 91
 \item{ Transcriptome Profiling }
93
-}
94
-List for legacy archive
95
-\itemize{
96
-\item{ Biospecimen }
97
-\item{ Clinical }
98
-\item{ Copy number variation }
99
-\item{ DNA methylation }
100
-\item{ Gene expression }
101
-\item{ Protein expression }
102
-\item{ Raw microarray data }
103
-\item{ Raw sequencing data }
104
-\item{ Simple nucleotide variation }
105 92
 }}
106 93
 
107 94
 \item{data.type}{A data type to filter the files to download
... ...
@@ -109,8 +96,6 @@ For the complete list please check the vignette.}
109 96
 
110 97
 \item{workflow.type}{GDC workflow type}
111 98
 
112
-\item{legacy}{Search in the legacy repository}
113
-
114 99
 \item{access}{Filter by access type. Possible values: controlled, open}
115 100
 
116 101
 \item{platform}{Example:
... ...
@@ -140,19 +125,13 @@ HumanMethylation27                \tab Mixed_DNASeq_Cont_curated      \cr
140 125
 IlluminaHiSeq_RNASeqV2            \tab Mixed_DNASeq_Cont
141 126
 }}
142 127
 
143
-\item{file.type}{To be used in the legacy database for some platforms,
144
-to define which file types to be used.}
145
-
146 128
 \item{barcode}{A list of barcodes to filter the files to download}
147 129
 
148 130
 \item{data.format}{Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML",
149 131
 "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")}
150 132
 
151
-\item{experimental.strategy}{Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
152
-Legacy:  WXS, RNA-Seq, miRNA-Seq, Genotyping Array,
153
-DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS,
154
-MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array,
155
-Total RNA-Seq, Capillary sequencing, Bisulfite-Seq}
133
+\item{experimental.strategy}{Filter by experimental strategy.
134
+Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.}
156 135
 
157 136
 \item{sample.type}{A sample type to filter the files to download}
158 137
 }
... ...
@@ -163,7 +142,6 @@ A data frame with the results and the parameters used
163 142
 Uses the GDC API to search for data; it searches for both controlled and
164 143
   open-access data.
165 144
   For GDC data, the arguments project, data.category, data.type and workflow.type should be used.
166
-  For the legacy data arguments project, data.category, platform and/or file.extension should be used.
167 145
   Please, see the vignette for a table with the possibilities.
168 146
 }
169 147
 \examples{
... ...
@@ -193,19 +171,6 @@ query <- GDCquery(
193 171
    data.type = "Masked Copy Number Segment",
194 172
    sample.type = c("Primary Tumor")
195 173
 )
196
-query.met <- GDCquery(
197
-   project = c("TCGA-GBM","TCGA-LGG"),
198
-   legacy = TRUE,
199
-   data.category = "DNA methylation",
200
-   platform = "Illumina Human Methylation 450"
201
-)
202
-query <- GDCquery(
203
-   project = "TCGA-ACC",
204
-   data.category =  "Copy number variation",
205
-   legacy = TRUE,
206
-   file.type = "hg19.seg",
207
-   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")
208
-)
209 174
 }
210 175
 }
211 176
 \author{
... ...
@@ -87,7 +87,6 @@ Creating a oncoprint
87 87
 }
88 88
 \examples{
89 89
 \dontrun{
90
-library(maftools)
91 90
 library(dplyr)
92 91
 query <- GDCquery(
93 92
    project = "TCGA-CHOL",
... ...
@@ -1,17 +1,16 @@
1
-context("Download AND PREPARE")
2
-
3
-
1
+context("Download and prepare")
4 2
 
5 3
 test_that("GDCdownload API method is working ", {
6 4
     skip_on_bioc()
7 5
     skip_if_offline()
8 6
 
9
-    cases <-  c(
7
+    cases <- c(
10 8
         "TCGA-PA-A5YG-01A-11R-A29S-07",
11 9
         "TCGA-OR-A5JX-01A-11R-A29S-07",
12 10
         "TCGA-PK-A5HA-01A-11R-A29S-07",
13 11
         "TCGA-OR-A5KY-01A-11R-A29S-07"
14 12
     )
13
+
15 14
     acc <- GDCquery(
16 15
         project =  c("TCGA-ACC"),
17 16
         data.category = "Transcriptome Profiling",
... ...
@@ -20,8 +19,8 @@ test_that("GDCdownload API method is working ", {
20 19
         barcode = substr(cases,1,12)
21 20
     )
22 21
     GDCdownload(acc, method = "api", directory = "ex")
23
-
24 22
     obj <- GDCprepare(acc,  directory = "ex",summarizedExperiment = TRUE)
23
+
25 24
     expect_true(all(substr(colnames(obj),1,12) == substr(cases,1,12)))
26 25
     expect_true(all(obj$barcode == cases))
27 26
 
... ...
@@ -46,9 +45,6 @@ test_that("GDCdownload API method is working ", {
46 45
     expect_true(all(query$results[[1]]$sample.submitter_id == data$sample_submitter_id))
47 46
 })
48 47
 
49
-
50
-
51
-
52 48
 test_that("getBarcodeInfo works", {
53 49
     skip_on_bioc()
54 50
     skip_if_offline()
... ...
@@ -61,11 +57,14 @@ test_that("getBarcodeInfo works", {
61 57
     x <- getBarcodeInfo(c("TARGET-20-PARUDL-03A"))
62 58
     expect_true(all(cols %in% colnames(x)))
63 59
 
64
-    samples <- c("HCM-CSHL-0063-C18-85A",
65
-                 "HCM-CSHL-0065-C20-06A",
66
-                 "HCM-CSHL-0065-C20-85A",
67
-                 "HCM-CSHL-0063-C18-01A")
60
+    samples <- c(
61
+        "HCM-CSHL-0063-C18-85A",
62
+        "HCM-CSHL-0065-C20-06A",
63
+        "HCM-CSHL-0065-C20-85A",
64
+        "HCM-CSHL-0063-C18-01A"
65
+    )
68 66
     x <- colDataPrepare(samples)
67
+
69 68
     expect_true(all(rownames(x) == samples))
70 69
     expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","gender"] == "male")
71 70
     expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","tumor_grade"] == "G2")
... ...
@@ -102,22 +101,29 @@ test_that("colDataPrepare handle replicates", {
102 101
 test_that("GDCprepare accepts more than one project", {
103 102
     skip_on_bioc()
104 103
     skip_if_offline()
105
-    cases <-  c("TCGA-OR-A5JX-01A", "TCGA-OR-A5J3-01A",
106
-                "TCGA-06-0680-11A","TCGA-14-0871-01A")
104
+    cases <-  c(
105
+        "TCGA-OR-A5JX-01A",
106
+        "TCGA-OR-A5J3-01A",
107
+        "TCGA-06-0680-11A",
108
+        "TCGA-14-0871-01A"
109
+    )
107 110
     expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% colDataPrepare(cases)$project_id))
108
-    acc.gbm <- GDCquery(project =  c("TCGA-ACC","TCGA-GBM"),
109
-                        data.category = "Transcriptome Profiling",
110
-                        data.type = "Gene Expression Quantification",
111
-                        workflow.type = "STAR - Counts",
112
-                        barcode = substr(cases,1,12))
113
-    GDCdownload(acc.gbm, method = "api", directory = "ex")
114
-    obj <- GDCprepare(acc.gbm,  directory = "ex")
111
+    query_acc_gbm <- GDCquery(
112
+        project =  c("TCGA-ACC","TCGA-GBM"),
113
+        data.category = "Transcriptome Profiling",
114
+        data.type = "Gene Expression Quantification",
115
+        workflow.type = "STAR - Counts",
116
+        barcode = substr(cases, 1, 12)
117
+    )
118
+    GDCdownload(query_acc_gbm, method = "api", directory = "ex")
119
+    obj <- GDCprepare(query_acc_gbm,  directory = "ex")
115 120
     expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% SummarizedExperiment::colData(obj)$project_id))
116 121
 })
117 122
 
118 123
 test_that("Non TCGA data is processed", {
119 124
     skip_on_bioc()
120 125
     skip_if_offline()
126
+
121 127
     proj <- "MMRF-COMMPASS"
122 128
     query <- GDCquery(
123 129
         project = proj,
... ...
@@ -132,8 +138,6 @@ test_that("Non TCGA data is processed", {
132 138
         workflow.type = "STAR - Counts",
133 139
         barcode = getResults(query)$cases[1:4]
134 140
     )
135
-    #GDCdownload(query)
136
-    #data <- GDCprepare(query)
137 141
 })
138 142
 
139 143
 test_that("Gene Level Copy Number is being correctly prepare", {
... ...
@@ -151,7 +155,7 @@ test_that("Gene Level Copy Number is being correctly prepare", {
151 155
     data <- GDCprepare(query,directory = "ex")
152 156
 
153 157
     expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7")))
154
-    unlink("ex",recursive = TRUE,force = TRUE)
158
+    unlink("ex", recursive = TRUE, force = TRUE)
155 159
 })
156 160
 
157 161
 test_that("DNAm files is processed correctly", {
... ...
@@ -170,28 +174,6 @@ test_that("DNAm files is processed correctly", {
170 174
     expect_lt(abs(assay(data.hg38)["cg16739396","TCGA-E2-A158-01A-11D-A12E-05"] - 0.0688655418909783),10^-10)
171 175
 })
172 176
 
173
-test_that("IDAT files is processed", {
174
-    skip_on_bioc()
175
-    skip_if_offline()
176
-
177
-    proj <- "TCGA-LUAD"
178
-    query <- GDCquery(
179
-        project = proj,
180
-        data.category = "Raw microarray data",
181
-        data.type = "Raw intensities",
182
-        experimental.strategy = "Methylation array",
183
-        legacy = TRUE,
184
-        file.type = ".idat",
185
-        barcode = "TCGA-55-7724",
186
-        platform = "Illumina Human Methylation 450"
187
-    )
188
-    #tryCatch(GDCdownload(query, method = "api", files.per.chunk = 20),
189
-    #         error = function(e) GDCdownload(query, method = "client"))
190
-    #betas <- GDCprepare(query)
191
-    #expect_true(nrow(betas) == 485577)
192
-    #expect_true(ncol(betas) == 1)
193
-})
194
-
195 177
 test_that("Prepare samples without clinical data", {
196 178
     skip_on_bioc()
197 179
     skip_if_offline()
... ...
@@ -214,30 +196,10 @@ test_that("Prepare multiple samples from the same patient", {
214 196
     expect_true("age_at_diagnosis" %in% colnames(x))
215 197
 })
216 198
 
217
-test_that("Preparing HT_HG-U133A as SE works", {
218
-    skip_on_bioc()
219
-    skip_if_offline()
220
-
221
-    query <- GDCquery(
222
-        project = "TCGA-GBM",
223
-        legacy = TRUE,
224
-        data.category = "Gene expression",
225
-        data.type = "Gene expression quantification",
226
-        platform = c("HT_HG-U133A")
227
-    )
228
-    query$results[[1]] <- query$results[[1]][1:2,]
229
-    GDCdownload(query, method = "api", files.per.chunk = 100)
230
-    se <- GDCprepare(query, summarizedExperiment = TRUE)
231
-
232
-    expect_true(is(se,"SummarizedExperiment"))
233
-})
234
-
235
-
236 199
 test_that("Preparing RRPA files with number of proteins works", {
237 200
     skip_on_bioc()
238 201
     skip_if_offline()
239 202
 
240
-
241 203
     query_rppa <- GDCquery(
242 204
         project = c("TCGA-COAD"),
243 205
         data.category = "Proteome Profiling",
... ...
@@ -249,9 +211,12 @@ test_that("Preparing RRPA files with number of proteins works", {
249 211
 
250 212
     GDCdownload(query_rppa)
251 213
 
252
-    expect_message(object = {
253
-        data_rppa <- GDCprepare(query_rppa)
254
-    },regexp = "Some files have a  different number of proteins, we will introduce NA for the missing values")
214
+    expect_message(
215
+        object = {
216
+            data_rppa <- GDCprepare(query_rppa)
217
+        },
218
+        regexp = "Some files have a  different number of proteins, we will introduce NA for the missing values"
219
+    )
255 220
 
256 221
     expect_true(is(data_rppa,"data.frame"))
257 222
 })
... ...
@@ -11,7 +11,7 @@ test_that("TCGAquery_SampleTypes returns the correct barcodes", {
11 11
 
12 12
 test_that("GDCquery_clinic populates correctly the data", {
13 13
     skip_on_bioc()
14
-    results <- GDCquery_clinic( "BEATAML1.0-COHORT")
14
+    results <- GDCquery_clinic(project = "BEATAML1.0-COHORT")
15 15
     results.2028 <- results[results$submitter_id == "2028",]
16 16
     expect_equal(results.2028$vital_status,"Alive")
17 17
     expect_true(
... ...
@@ -27,7 +27,7 @@ test_that("GDCquery_clinic populates correctly the data", {
27 27
     expect_equal(results.42$ethnicity,"not hispanic or latino")
28 28
     expect_equal(as.integer(results.2028$age_at_diagnosis %>% as.numeric() / 365.25),56)
29 29
 
30
-    results <- GDCquery_clinic( "TCGA-LUAD")
30
+    results <- GDCquery_clinic(project = "TCGA-LUAD")
31 31
     results.sample <- results[results$submitter_id == "TCGA-80-5608",]
32 32
     expect_equal(results.sample$vital_status,"Alive")
33 33
     expect_equal(results.sample$gender,"female")
... ...
@@ -20,16 +20,19 @@ test_that("GDCquery accepts more than one project", {
20 20
         data.category = "Copy Number Variation",
21 21
         data.type = "Copy Number Segment"
22 22
     )
23
+
23 24
     gbm <- GDCquery(
24 25
         project = "TCGA-GBM",
25 26
         data.category = "Copy Number Variation",
26 27
         data.type = "Copy Number Segment"
27 28
     )
29
+
28 30
     acc.gbm <- GDCquery(
29 31
         project =  c("TCGA-ACC","TCGA-GBM"),
30 32
         data.category = "Copy Number Variation",
31 33
         data.type = "Copy Number Segment"
32 34
     )
35
+
33 36
     expect_equal(unique(acc.gbm$results[[1]]$data_type),"Copy Number Segment")
34 37
     expect_equal(nrow(acc.gbm$results[[1]]), sum(nrow(acc$results[[1]]),nrow(gbm$results[[1]])))
35 38
     expect_true(nrow(dplyr::anti_join(acc$results[[1]],acc.gbm$results[[1]], by = "file_id")) == 0)
... ...
@@ -51,34 +54,24 @@ test_that("GDCquery can filter by sample.type", {
51 54
     expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
52 55
 
53 56
     sample.type <- "Solid Tissue Normal"
54
-    query <- GDCquery(project = "TCGA-ACC",
55
-                      data.category =  "Copy Number Variation",
56
-                      data.type = "Masked Copy Number Segment",
57
-                      sample.type = sample.type)
58
-    expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
59
-
60
-    sample.type <- "Solid Tissue Normal"
61
-    query <- GDCquery(project =  c("TCGA-COAD"),
62
-                      data.category = "Transcriptome Profiling",
63
-                      data.type = "Gene Expression Quantification",
64
-                      workflow.type = "STAR - Counts",
65
-                      sample.type = sample.type)
57
+    query <- GDCquery(
58
+        project = "TCGA-ACC",
59
+        data.category =  "Copy Number Variation",
60
+        data.type = "Masked Copy Number Segment",
61
+        sample.type = sample.type
62
+    )
66 63
     expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
67 64
 
68
-
69 65
     sample.type <- "Solid Tissue Normal"
70
-    query <- GDCquery(project = "TCGA-BRCA",
71
-                      legacy = TRUE,
72
-                      data.category = "Gene expression",
73
-                      data.type = "Gene expression quantification",
74
-                      platform = "Illumina HiSeq",
75
-                      file.type = "results",
76
-                      experimental.strategy = "RNA-Seq",
77
-                      sample.type = sample.type)
66
+    query <- GDCquery(
67
+        project =  c("TCGA-COAD"),
68
+        data.category = "Transcriptome Profiling",
69
+        data.type = "Gene Expression Quantification",
70
+        workflow.type = "STAR - Counts",
71
+        sample.type = sample.type
72
+    )
78 73
     expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
79 74
 
80
-
81
-
82 75
     sample.type <- c("Solid Tissue Normal", "Primary Tumor")
83 76
     query <- GDCquery(
84 77
         project = "TCGA-ACC",
... ...
@@ -121,56 +114,6 @@ test_that("GDCquery can filter by barcode", {
121 114
     expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases))
122 115
 })
123 116
 
124
-test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg18", {
125
-    skip_on_bioc()
126
-    skip_if_offline()
127
-
128
-    query <- GDCquery(project = "TCGA-ACC",
129
-                      data.category =  "Copy number variation",
130
-                      legacy = TRUE,
131
-                      file.type = "nocnv_hg18.seg",
132
-                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
133
-    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg18.seg.txt")
134
-})
135
-
136
-test_that("GDCquery can filter copy number from legacy data by file type. Case: hg18", {
137
-    skip_on_bioc()
138
-    skip_if_offline()
139
-
140
-    query <- GDCquery(project = "TCGA-ACC",
141
-                      data.category =  "Copy number variation",
142
-                      legacy = TRUE,
143
-                      file.type = "hg18.seg",
144
-                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
145
-    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg18.seg.txt")
146
-})
147
-
148
-test_that("GDCquery can filter copy number from legacy data by file type. Case: hg19", {
149
-    skip_on_bioc()
150
-    skip_if_offline()
151
-
152
-    query <- GDCquery(project = "TCGA-ACC",
153
-                      data.category =  "Copy number variation",
154
-                      legacy = TRUE,
155
-                      file.type = "hg19.seg",
156
-                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
157
-    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg19.seg.txt")
158
-})
159
-
160
-
161
-test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg19", {
162
-    skip_on_bioc()
163
-    skip_if_offline()
164
-
165
-    query <- GDCquery(project = "TCGA-ACC",
166
-                      data.category =  "Copy number variation",
167
-                      legacy = TRUE,
168
-                      file.type = "nocnv_hg19.seg",
169
-                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
170
-    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg19.seg.txt")
171
-
172
-})
173
-
174 117
 
175 118
 test_that("GDCquery can filter by access level", {
176 119
     skip_on_bioc()
... ...
@@ -186,15 +129,12 @@ test_that("GDCquery can filter by access level", {
186 129
     expect_equal(unique(query$results[[1]]$access),"controlled")
187 130
 })
188 131
 
189
-
190
-
191
-
192 132
 test_that("getNbFiles and getNbCases works", {
193 133
     skip_on_bioc()
194 134
     skip_if_offline()
195 135
 
196 136
     aux <- getProjectSummary("TCGA-LUAD",TRUE)
197
-    files <- getNbFiles("TCGA-LUAD","Raw microarray data",legacy = T)
137
+    files <- getNbFiles("TCGA-LUAD","Raw microarray data")
198 138
     cases <- getNbCases("TCGA-LUAD","Raw microarray data")
199 139
     expect_true(cases < files)
200 140
 })
... ...
@@ -72,18 +72,8 @@ which defines the output type a Summarized Experiment (default option) or a data
72 72
 To create a summarized Experiment object we annotate the data with genomic positions
73 73
 with the latest patch release version of the genome available. 
74 74
 
75
-For legacy data (data aligned to hg19) TCGAbiolinks is using GRCh37.p13 and for 
76
-harmonized data (data aligned to hg38) now it is using Gencode version 36.
77 75
 
78
-Unfortunately, some of the updates changes/remove gene symbols, change coordinates, etc. 
79
-Which might introduce some loss of data. For example, if the gene was removed we cannot map
80
-it anymore and that information will be lost in the `SummarizedExperiment`.
81
-
82
-If you set `SummarizedExperiment` to `FALSE`, you will get the data unmodified 
83
-just as they are in the files and ad your own annotation.
84
-
85
-Also, there are no updated for DNA methylation data. But the last metadata available can be found
86
-here: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation)
76
+Also, the latest DNA methylation metadata is available at: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation)
87 77
 
88 78
 </div>
89 79
 </div>
... ...
@@ -132,48 +122,6 @@ in `GDCprepare` and `GDCdownload`
132 122
 | mut.pipeline 	| If add.gistic2.mut is not NULL this field will be taken in consideration. Four separate variant calling pipelines are implemented for GDC data harmonization. Options: muse, varscan2, somaticsniper, MuTect2. For more information: https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/ 	|
133 123
 | mutant_variant_classification 	| List of variant classifications that determine whether a sample is considered mutant. Default: "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation" 	|
134 124
 
135
-## Search and download data from legacy database using GDC api method
136
-
137
-In this example we will download gene expression data from legacy database (data 
138
-aligned against genome of reference hg19) using GDC api method and  we will show object data and metadata.
139
-```{r results = 'hide', message=FALSE, warning=FALSE, eval = F}
140
-query <- GDCquery(
141
-    project = "TCGA-GBM",
142
-    data.category = "Gene expression",
143
-    data.type = "Gene expression quantification",
144
-    platform = "Illumina HiSeq", 
145
-    file.type  = "normalized_results",
146
-    experimental.strategy = "RNA-Seq",
147
-    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
148
-    legacy = TRUE
149
-)
150
-GDCdownload(
151
-    query = query, 
152
-    method = "api", 
153
-    files.per.chunk = 10
154
-)
155
-data <- GDCprepare(query = query)
156
-```
157
-
158
-```{r message=FALSE, warning=FALSE, include=FALSE}
159
-data <- gbm.exp.legacy
160
-```
161
-
162
-```{r message=FALSE, warning=FALSE}
163
-# Gene expression aligned against hg19.
164
-datatable(
165
-    as.data.frame(colData(data)), 
166
-    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
167
-    rownames = FALSE)
168
-# Only first 20 rows to make render faster
169
-datatable(
170
-    assay(data)[1:20,], 
171
-    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
172
-    rownames = TRUE
173
-)
174
-
175
-rowRanges(data)
176
-```
177 125
 
178 126
 
179 127
 ## Search and download data for two samples from database
... ...
@@ -238,44 +186,6 @@ Examples of query, download, prepare can be found in this [gist](https://gist.gi
238 186
 | Biospecimen                 | Biospecimen Supplement            |      |             |
239 187
 | Clinical                    |                |       |                  |
240 188
 
241
-## Legacy data
242
-| Data.category               | Data.type                         | Platform                            | file.type          | Status          |
243
-|-----------------------------|-----------------------------------|-------------------------------------|--------------------|-----------------|
244
-| Transcriptome Profiling     |                                   |                                     |                    |                 |
245
-| Copy number variation       | -                                 | Affymetrix SNP Array 6.0            | nocnv_hg18.seg     | Working         |
246
-|                             | -                                 | Affymetrix SNP Array 6.0            | hg18.seg           | Working         |
247
-|                             | -                                 | Affymetrix SNP Array 6.0            | nocnv_hg19.seg     | Working         |
248
-|                             | -                                 | Affymetrix SNP Array 6.0            | hg19.seg           | Working         |
249
-|                             | -                                 | Illumina HiSeq                      | Several            | Working         |
250
-| Simple Nucleotide Variation | Simple somatic mutation           |                                     |                    |                 |
251
-| Raw Sequencing Data         |                                   |                                     |                    |                 |
252
-| Biospecimen                 |                                   |                                     |                    |                 |
253
-| Clinical                    |                                   |                                     |                    |                 |
254
-| Protein expression          |                                   | MDA RPPA Core                       | -                  | Working         |
255
-| Gene expression             | Gene expression quantification    | Illumina HiSeq                      | normalized_results | Working         |
256
-|                             |                                   | Illumina HiSeq                      | results            | Working         |
257
-|                             |                                   | HT_HG-U133A                         | -                  | Working         |
258
-|                             |                                   | AgilentG4502A_07_2                  | -                  | Data frame only |
259
-|                             |                                   | AgilentG4502A_07_1                  | -                  | Data frame only |
260
-|                             |                                   | HuEx-1_0-st-v2                      | FIRMA.txt          | Not Preparing   |
261
-|                             |                                   |                                     | gene.txt           | Not Preparing   |
262
-|                             | Isoform expression quantification |                                     |                    |                 |
263
-|                             | miRNA gene quantification         |                                     |                    |                 |
264
-|                             | Exon junction quantification      |                                     |                    |                 |
265
-|                             | Exon quantification               |                                     |                    |                 |
266
-|                             | miRNA isoform quantification      |                                     |                    |                 |
267
-|                             |                                   |                                     |                    |                 |
268
-| DNA methylation             |                                   | Illumina Human Methylation 450      | Not used           | Working         |
269
-|                             |                                   | Illumina Human Methylation 27       | Not used           | Working         |
270
-|                             |                                   | Illumina DNA Methylation OMA003 CPI | Not used           | Working         |
271
-|                             |                                   | Illumina DNA Methylation OMA002 CPI | Not used           | Working         |
272
-|                             |                                   | Illumina Hi Seq                     |                    | Not  working    |
273
-| Raw Microarray Data         |                                   |                                     |                    |                 |
274
-| Structural Rearrangement    |                                   |                                     |                    |                 |
275
-| Other                       |                                   |                                     |                    |                 |
276
-
277
-
278
-
279 189
 # Examples
280 190
 
281 191
 
... ...
@@ -444,8 +354,7 @@ query <- GDCquery(
444 354
     project = "TCGA-BRCA",
445 355
     data.category = "DNA Methylation",
446 356
     data.type = "Masked Intensities",
447
-    platform = "Illumina Human Methylation 27",
448
-    legacy = FALSE
357
+    platform = "Illumina Human Methylation 27"
449 358
 )
450 359
 GDCdownload(query, files.per.chunk=10)
451 360
 betas <- GDCprepare(query)
... ...
@@ -454,10 +363,9 @@ query <- GDCquery(
454 363
     project = "HCMI-CMDC",
455 364
     data.category = "DNA Methylation",
456 365
     data.type = "Masked Intensities",
457
-    platform = "Illumina Methylation Epic",
458
-    legacy = FALSE
366
+    platform = "Illumina Methylation Epic"
459 367
 )
460
-GDCdownload(query, files.per.chunk=10)
368
+GDCdownload(query, files.per.chunk = 10)
461 369
 betas <- GDCprepare(query)
462 370
 
463 371
 
... ...
@@ -465,8 +373,7 @@ query <- GDCquery(
465 373
     project = "CPTAC-3",
466 374
     data.category = "DNA Methylation",
467 375
     data.type = "Masked Intensities",
468
-    platform = "Illumina Methylation Epic",
469
-    legacy = FALSE
376
+    platform = "Illumina Methylation Epic"
470 377
 )
471 378
 GDCdownload(query, files.per.chunk=10)
472 379
 betas <- GDCprepare(query)
... ...
@@ -475,10 +382,9 @@ query <- GDCquery(
475 382
     project = "TCGA-BRCA",
476 383
     data.category = "DNA Methylation",
477 384
     data.type = "Masked Intensities",
478
-    platform = "Illumina Methylation Epic",
479
-    legacy = FALSE
385
+    platform = "Illumina Methylation Epic"
480 386
 )
481
-GDCdownload(query, files.per.chunk=10)
387
+GDCdownload(query, files.per.chunk = 10)
482 388
 betas <- GDCprepare(query)
483 389
 
484 390
 
... ...
@@ -571,7 +477,6 @@ https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeli
571 477
 query.sc.analysis <- GDCquery(
572 478
     project = "CPTAC-3", 
573 479
     data.category = "Transcriptome Profiling",
574
-    legacy = FALSE,
575 480
     access = "open",
576 481
     data.type = "Single Cell Analysis",
577 482
     data.format =  "TSV"
... ...
@@ -584,7 +489,6 @@ Single.Cell.Analysis.list <- GDCprepare(query.sc.analysis)
584 489
 query.hdF5 <- GDCquery(
585 490
     project = "CPTAC-3", 
586 491
     data.category = "Transcriptome Profiling",
587
-    legacy = FALSE,
588 492
     access = "open",
589 493
     data.type = "Single Cell Analysis",
590 494
     barcode = c("CPT0167860015","CPT0206880004"),
... ...
@@ -598,7 +502,6 @@ df.HDF5 <- GDCprepare(query.hdF5)
598 502
 query.raw.counts <- GDCquery(
599 503
     project = "CPTAC-3", 
600 504
     data.category = "Transcriptome Profiling",
601
-    legacy = FALSE,
602 505
     access = "open",
603 506
     data.type = "Gene Expression Quantification",
604 507
     barcode = c("CPT0167860015","CPT0206880004"),
... ...
@@ -612,7 +515,6 @@ raw.counts.list <- GDCprepare(query.raw.counts)
612 515
 query.filtered.counts <- GDCquery(
613 516
     project = "CPTAC-3", 
614 517
     data.category = "Transcriptome Profiling",
615
-    legacy = FALSE,
616 518
     access = "open",
617 519
     data.type = "Gene Expression Quantification",
618 520
     barcode = c("CPT0167860015","CPT0206880004"),
... ...
@@ -627,7 +529,6 @@ filtered.counts.list <- GDCprepare(query.filtered.counts)
627 529
 query.sc.dea <- GDCquery(
628 530
     project = "CPTAC-3", 
629 531
     data.category = "Transcriptome Profiling",
630
-    legacy = FALSE,
631 532
     access = "open",
632 533
     data.type = "Differential Gene Expression",
633 534
     barcode = c("CPT0167860015","CPT0206880004"),
... ...
@@ -636,91 +537,3 @@ query.sc.dea <- GDCquery(
636 537
 GDCdownload(query.sc.dea)
637 538
 sc.dea.list <- GDCprepare(query.sc.dea)
638 539
 ```
639
-
640
-## Legacy archive: data aligned against hg19
641
-
642
-### DNA methylation: Get all TCGA IDAT files
643
-
644
-```{r message=FALSE, warning=FALSE, eval =FALSE}
645
-#-------------------------------------------------------
646
-# Example to idat files from TCGA projects
647
-#-------------------------------------------------------
648
-projects <- TCGAbiolinks:::getGDCprojects()$project_id
649
-projects <- projects[grepl('^TCGA',projects,perl=T)]
650
-match.file.cases.all <- NULL
651
-for(proj in projects){
652
-    print(proj)
653
-    query <- GDCquery(
654
-        project = proj,
655
-        data.category = "Raw microarray data",
656
-        data.type = "Raw intensities", 
657
-        experimental.strategy = "Methylation array", 
658
-        legacy = TRUE,
659
-        file.type = ".idat",
660
-        platform = "Illumina Human Methylation 450"
661
-    )
662
-    match.file.cases <- getResults(query,cols=c("cases","file_name"))
663
-    match.file.cases$project <- proj
664
-    match.file.cases.all <- rbind(match.file.cases.all,match.file.cases)
665
-    tryCatch(
666
-        GDCdownload(query, method = "api", files.per.chunk = 20),
667
-        error = function(e) GDCdownload(query, method = "client")
668
-    )
669
-}
670
-# This will create a map between idat file name, cases (barcode) and project
671
-readr::write_tsv(match.file.cases.all, path =  "idat_filename_case.txt")
672
-# code to move all files to local folder
673
-for(file in dir(".",pattern = ".idat", recursive = T)){
674
-    TCGAbiolinks::move(file,basename(file))
675
-}
676
-```
677
-
678
-
679
-### DNA methylation
680
-
681
-```{r, eval = FALSE}
682
-query_meth.hg19 <- GDCquery(
683
-    project= "TCGA-LGG", 
684
-    data.category = "DNA methylation", 
685
-    platform = "Illumina Human Methylation 450", 
686
-    barcode = c("TCGA-HT-8111-01A-11D-2399-05","TCGA-HT-A5R5-01A-11D-A28N-05"), 
687
-    legacy = TRUE
688
-)
689
-GDCdownload(query_meth.hg19)
690
-data.hg19 <- GDCprepare(query_meth.hg19)
691
-```
692
-
693
-
694
-### Protein expression
695
-```{r, eval = FALSE}
696
-query <- GDCquery(
697
-    project = "TCGA-GBM",
698
-    data.category = "Protein expression",
699
-    legacy = TRUE, 
700
-    barcode = c("TCGA-OX-A56R-01A-21-A44T-20","TCGA-08-0357-01A-21-1898-20")
701
-)
702
-GDCdownload(query)
703
-data <- GDCprepare(
704
-    query, save = TRUE, 
705
-    save.filename = "gbmProteinExpression.rda",
706
-    remove.files.prepared = TRUE
707
-)
708
-```
709
-
710
-
711
-### Gene expression
712
-```{r, eval = FALSE}
713
-# Aligned against Hg19
714
-query.exp.hg19 <- GDCquery(
715
-    project = "TCGA-GBM",
716
-    data.category = "Gene expression",
717
-    data.type = "Gene expression quantification",
718
-    platform = "Illumina HiSeq", 
719
-    file.type  = "normalized_results",
720
-    experimental.strategy = "RNA-Seq",
721
-    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
722
-    legacy = TRUE
723
-)
724
-GDCdownload(query.exp.hg19)
725
-data <- GDCprepare(query.exp.hg19)
726
-```
... ...
@@ -18,8 +18,6 @@ knitr::opts_knit$set(progress = FALSE)
18 18
 
19 19
 
20 20
 **TCGAbiolinks** has provided a few functions to search GDC database.
21
-This section starts by explaining the different GDC sources (Harmonized and Legacy Archive), followed by some examples
22
-how to access them.
23 21
 
24 22
 
25 23
 ---
... ...
@@ -33,23 +31,6 @@ library(DT)
33 31
 
34 32
 #  Useful information
35 33
 
36
-<div class="panel panel-info">
37
-<div class="panel-heading">Different sources: Legacy vs Harmonized</div>
38
-<div class="panel-body">
39
-
40
-
41
-There are two available sources to download GDC data using TCGAbiolinks:
42
-
43
-- GDC Legacy Archive : provides access to an unmodified copy of data that was previously stored in
44
-[CGHub](https://cghub.ucsc.edu/) and in the TCGA Data Portal hosted by the TCGA Data Coordinating Center (DCC), in which uses
45
-as references GRCh37 (hg19) and GRCh36 (hg18).
46
-- GDC harmonized database: data available was harmonized against GRCh38 (hg38) using GDC Bioinformatics Pipelines
47
-which provides methods to the standardization of biospecimen and
48
-clinical data.
49
-
50
-</div>
51
-</div>
52
-
53 34
 
54 35
 <div class="panel panel-info">
55 36
 <div class="panel-heading">Understanding the barcode</div>
... ...
@@ -79,7 +60,6 @@ with the following arguments:
79 60
 | data.category 	| A valid data category (see list with TCGAbiolinks:::getProjectSummary(project)) 	|  	|
80 61
 | data.type 	| A data type to filter the files to download 	|  	|
81 62
 | workflow.type 	| GDC workflow type 	|  	|
82
-| legacy 	| Search in the legacy repository 	|  	|
83 63
 | access 	| Filter by access type. Possible values: controlled, open 	|  	|
84 64
 | platform 	| Example: 	|  	|
85 65
 |  	| CGH- 1x1M_G4447A 	| IlluminaGA_RNASeqV2 	|
... ...
@@ -107,7 +87,7 @@ with the following arguments:
107 87
 |  	| IlluminaHiSeq_RNASeqV2 	| Mixed_DNASeq_Cont 	|
108 88
 | file.type 	| To be used in the legacy database for some platforms, to define which file types to be used. 	|  	|
109 89
 | barcode 	| A list of barcodes to filter the files to download 	|  	|
110
-| experimental.strategy 	| Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, Total RNA-Seq, Capillary sequencing, Bisulfite-Seq 	|  	|
90
+| experimental.strategy 	| Filter by experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. |  	|
111 91
 | sample.type 	| A sample type to filter the files to download 	|  	|
112 92
 
113 93
 
... ...
@@ -138,7 +118,7 @@ datatable(
138 118
 The other fields (data.category, data.type, workflow.type, platform, file.type) can be found below. 
139 119
 Please, note that these tables are still incomplete.
140 120
 
141
-## Harmonized data options (`legacy = FALSE`)
121
+## Harmonized data options 
142 122
 
143 123
 ```{r, echo=FALSE}
144 124
 datatable(
... ...
@@ -149,21 +129,12 @@ datatable(
149 129
 )
150 130
 ```
151 131
 
152
-## Legacy archive data  options (`legacy = TRUE`)
153
-```{r, echo=FALSE}
154
-datatable(
155
-    readr::read_csv("https://docs.google.com/spreadsheets/d/1f98kFdj9mxVDc1dv4xTZdx8iWgUiDYO-qiFJINvmTZs/export?format=csv&gid=1817673686",col_types = readr::cols()),
156
-    filter = 'top',
157
-    options = list(scrollX = TRUE, keys = TRUE, pageLength = 40), 
158
-    rownames = FALSE
159
-)
160
-```
161 132
 
162 133
 # Harmonized database examples
163 134
 
164 135
 ## DNA methylation data: Recurrent tumor samples
165 136
 
166
-In this example we will access the harmonized database (`legacy = FALSE`) 
137
+In this example we will access the harmonized database
167 138
 and search for all DNA methylation data for recurrent glioblastoma multiforme (GBM) 
168 139
 and low grade gliomas (LGG) samples.
169 140
 
... ...
@@ -172,7 +143,6 @@ and low grade gliomas (LGG) samples.
172 143
 query <- GDCquery(
173 144
     project = c("TCGA-GBM", "TCGA-LGG"),
174 145
     data.category = "DNA Methylation",
175
-    legacy = FALSE,
176 146
     platform = c("Illumina Human Methylation 450"),
177 147
     sample.type = "Recurrent Tumor"
178 148
 )
... ...
@@ -186,19 +156,18 @@ datatable(
186 156
 
187 157
 ## Samples with DNA methylation and gene expression data
188 158
 
189
-In this example we will access the harmonized database (`legacy = FALSE`) 
159
+In this example we will access the harmonized database 
190 160
 and search for all patients with DNA methylation (platform HumanMethylation450k) and gene expression data
191 161
 for Colon Adenocarcinoma tumor (TCGA-COAD).
192 162
 
193 163
 
194 164
 ```{r message=FALSE, warning = FALSE, eval = FALSE}
195
-query.met <- GDCquery(
165
+query_met <- GDCquery(
196 166
     project = "TCGA-COAD",
197 167
     data.category = "DNA Methylation",
198
-    legacy = FALSE,
199 168
     platform = c("Illumina Human Methylation 450")
200 169
 )
201
-query.exp <- GDCquery(
170
+query_exp <- GDCquery(
202 171
     project = "TCGA-COAD",
203 172
     data.category = "Transcriptome Profiling",
204 173
     data.type = "Gene Expression Quantification", 
... ...
@@ -207,20 +176,19 @@ query.exp <- GDCquery(
207 176
 
208 177
 # Get all patients that have DNA methylation and gene expression.
209 178
 common.patients <- intersect(
210
-    substr(getResults(query.met, cols = "cases"), 1, 12),
211
-    substr(getResults(query.exp, cols = "cases"), 1, 12)
179
+    substr(getResults(query_met, cols = "cases"), 1, 12),
180
+    substr(getResults(query_exp, cols = "cases"), 1, 12)
212 181
 )
213 182
 
214 183
 # Only select the first 5 patients
215
-query.met <- GDCquery(
184
+query_met <- GDCquery(
216 185
     project = "TCGA-COAD",
217 186
     data.category = "DNA Methylation",
218
-    legacy = FALSE,
219 187
     platform = c("Illumina Human Methylation 450"),
220 188
     barcode = common.patients[1:5]
221 189
 )
222 190
 
223
-query.exp <- GDCquery(
191
+query_exp <- GDCquery(
224 192
     project = "TCGA-COAD",
225 193
     data.category = "Transcriptome Profiling",
226 194
     data.type = "Gene Expression Quantification", 
... ...
@@ -231,13 +199,13 @@ query.exp <- GDCquery(
231 199
 
232 200
 ```{r results_matched, message=FALSE, warning=FALSE, eval = FALSE}
233 201
 datatable(
234
-    getResults(query.met, cols = c("data_type","cases")),
202
+    getResults(query_met, cols = c("data_type","cases")),
235 203
     filter = 'top',
236 204
     options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
237 205
     rownames = FALSE
238 206
 )
239 207
 datatable(
240
-    getResults(query.exp, cols = c("data_type","cases")), 
208
+    getResults(query_exp, cols = c("data_type","cases")), 
241 209
     filter = 'top',
242 210
     options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
243 211
     rownames = FALSE
... ...
@@ -327,98 +295,13 @@ datatable(
327 295
 ```
328 296
 
329 297
 
330
-# Legacy archive examples
331
-
332
-## DNA methylation
333
-
334
-### Array-based assays
335
-
336
-This example shows how the user can search for  glioblastoma multiform (GBM) 
337
-and DNA methylation data 
338
-for platform Illumina Human Methylation 450 and Illumina Human Methylation 27.
339
-
340
-```{r message=FALSE, warning=FALSE}
341
-query <- GDCquery(
342
-    project = c("TCGA-GBM"),
343
-    legacy = TRUE,
344
-    data.category = "DNA methylation",
345
-    platform = c("Illumina Human Methylation 450", "Illumina Human Methylation 27")
346
-)
347
-datatable(
348
-    getResults(query, rows = 1:100), 
349
-    filter = 'top',
350
-    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
351
-    rownames = FALSE
352
-)
353
-```
354
-
355
-### whole-genome bisulfite sequencing (WGBS) 
356
-
357
-```{r message = FALSE, warning = FALSE, eval = FALSE}
358
-
359
-query <- GDCquery(
360
-    project = c("TCGA-LUAD"),
361
-    legacy = TRUE,
362
-    data.category = "DNA methylation",
363
-    data.type = "Methylation percentage",
364
-    experimental.strategy = "Bisulfite-Seq"
365
-)
366
-
367
-# VCF - controlled data
368
-query <- GDCquery(
369
-    project = c("TCGA-LUAD"),
370
-    legacy = TRUE,
371
-    data.category = "DNA methylation",
372
-    data.type = "Bisulfite sequence alignment",
373
-    experimental.strategy = "Bisulfite-Seq"
374
-)
375
-
376
-
377
-# WGBS BAM files - controlled data
378
-query <- GDCquery(
379
-    project = c("TCGA-LUAD"),
380
-    legacy = TRUE,
381
-    data.type = "Aligned reads",
382
-    data.category = "Raw sequencing data",
383
-    experimental.strategy = "Bisulfite-Seq"
384
-)
385
-```
386
-
387
-
388
-## Gene expression
389
-
390
-This exmaple shows how the user can search for  glioblastoma multiform (GBM) 
391
-gene expression data with the normalized results for expression of a gene. 
392
-For more information about file.types check [GDC TCGA file types](https://gdc.cancer.gov/resources-tcga-users/legacy-archive-tcga-tag-descriptions)
393
-
394
-```{r message=FALSE, warning=FALSE}
395
-# Gene expression aligned against hg19.
396
-query.exp.hg19 <- GDCquery(
397
-    project = "TCGA-GBM",
398
-    data.category = "Gene expression",
399
-    data.type = "Gene expression quantification",
400
-    platform = "Illumina HiSeq", 
401
-    file.type  = "normalized_results",
402
-    experimental.strategy = "RNA-Seq",
403
-    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
404
-    legacy = TRUE
405
-)
406
-
407
-datatable(
408
-    getResults(query.exp.hg19), 
409
-    filter = 'top',
410
-    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
411
-    rownames = FALSE
412
-)
413
-```
414
-
415 298
 # Get Manifest file
416 299
 
417 300
 If you want to get the manifest file from the query object you can use the function *getManifest*. If you 
418
-set save to TRUEm a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [ddt-ui](https://github.com/NCI-GDC/dtt-ui) will be created.
301
+set save to `TRUE`, a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [dtt-ui](https://github.com/NCI-GDC/dtt-ui) will be created.
419 302
 
420 303
 ```{r message=FALSE, warning=FALSE}
421
-getManifest(query.exp.hg19,save = FALSE) 
304
+getManifest(query,save = FALSE) 
422 305
 ```
423 306
 
424 307
 # ATAC-seq data
... ...
@@ -440,10 +323,10 @@ datatable(
440 323
 You can use the function `GDCquery_ATAC_seq` to filter the manifest table and use `GDCdownload` to save the data locally.
441 324
 ```{r message=FALSE, warning=FALSE,eval = FALSE}
442 325
 query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "rds") 
443
-GDCdownload(query,method = "client")
326
+GDCdownload(query, method = "client")
444 327
 
445 328
 query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "bigWigs") 
446
-GDCdownload(query,method = "client")
329
+GDCdownload(query, method = "client")
447 330
 
448 331
 ```
449 332