... | ... |
@@ -344,13 +344,16 @@ GDCquery_clinic <- function( |
344 | 344 |
} else { |
345 | 345 |
# HTMCP-03-06-02061 has two diagnoses |
346 | 346 |
x$submitter_id <- gsub("_diagnosis.*","",x$submitter_id) |
347 |
+ # If there are two rows for the same submitter_id |
|
348 |
+ # we will collapse them into one single row |
|
349 |
+ # concatenating all columns using ; |
|
347 | 350 |
aux <- x %>% dplyr::group_by(submitter_id) %>% |
348 |
- dplyr::summarise_each(funs(paste(unique(.), collapse = ";"))) |
|
351 |
+ summarise(across(everything(),~ paste(unique(.), collapse = ";"))) |
|
349 | 352 |
aux$treatments <- list(dplyr::bind_rows(x$treatments)) |
350 | 353 |
aux |
351 | 354 |
} |
352 | 355 |
} |
353 |
- ),fill = T |
|
356 |
+ ), fill = TRUE |
|
354 | 357 |
) |
355 | 358 |
#df$submitter_id <- gsub("^d|_diagnosis|diag-|-DX|-DIAG|-diagnosis","", df$submitter_id) |
356 | 359 |
# ^d ORGANOID-PANCREATIC |
... | ... |
@@ -500,7 +503,7 @@ GDCprepare_clinic <- function( |
500 | 503 |
} |
501 | 504 |
|
502 | 505 |
# Get all the clinical xml files |
503 |
- source <- ifelse(query$legacy,"legacy","harmonized") |
|
506 |
+ source <- "harmonized" |
|
504 | 507 |
files <- file.path( |
505 | 508 |
query$results[[1]]$project, source, |
506 | 509 |
gsub(" ","_",query$results[[1]]$data_category), |
... | ... |
@@ -16,15 +16,6 @@ |
16 | 16 |
#' @importFrom methods is |
17 | 17 |
#' @export |
18 | 18 |
#' @examples |
19 |
-#' query <- GDCquery( |
|
20 |
-#' project = "TCGA-ACC", |
|
21 |
-#' data.category = "Copy number variation", |
|
22 |
-#' legacy = TRUE, |
|
23 |
-#' file.type = "hg19.seg", |
|
24 |
-#' barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01") |
|
25 |
-#' ) |
|
26 |
-#' # data will be saved in GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation |
|
27 |
-#' GDCdownload(query, method = "api") |
|
28 | 19 |
#' \dontrun{ |
29 | 20 |
#' # Download clinical data from XML |
30 | 21 |
#' query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical") |
... | ... |
@@ -39,14 +30,14 @@ |
39 | 30 |
#' # data will be saved in: |
40 | 31 |
#' # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification |
41 | 32 |
#' GDCdownload(query, method = "client", directory = "example_data_dir") |
42 |
-#' acc.gbm <- GDCquery( |
|
33 |
+#' query_acc_gbm <- GDCquery( |
|
43 | 34 |
#' project = c("TCGA-ACC","TCGA-GBM"), |
44 | 35 |
#' data.category = "Transcriptome Profiling", |
45 | 36 |
#' data.type = "Gene Expression Quantification", |
46 | 37 |
#' workflow.type = "STAR - Counts" |
47 | 38 |
#' ) |
48 | 39 |
#' GDCdownload( |
49 |
-#' query = acc.gbm, |
|
40 |
+#' query = query_acc_gbm, |
|
50 | 41 |
#' method = "api", |
51 | 42 |
#' directory = "example", |
52 | 43 |
#' files.per.chunk = 50 |
... | ... |
@@ -73,7 +64,7 @@ GDCdownload <- function( |
73 | 64 |
stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.") |
74 | 65 |
} |
75 | 66 |
|
76 |
- source <- ifelse(query$legacy,"legacy","harmonized") |
|
67 |
+ source <- "harmonized" |
|
77 | 68 |
|
78 | 69 |
dir.create(directory, showWarnings = FALSE, recursive = TRUE) |
79 | 70 |
for(proj in unique(unlist(query$project))){ |
... | ... |
@@ -152,11 +143,7 @@ GDCdownload <- function( |
152 | 143 |
) |
153 | 144 |
} |
154 | 145 |
|
155 |
- server <- ifelse( |
|
156 |
- query$legacy, |
|
157 |
- "https://api.gdc.cancer.gov/legacy/data/", |
|
158 |
- "https://api.gdc.cancer.gov/data/" |
|
159 |
- ) |
|
146 |
+ server <- "https://api.gdc.cancer.gov/data/" |
|
160 | 147 |
|
161 | 148 |
if (is.null(files.per.chunk) & sum(as.numeric(manifest$size)) > 10^9) { |
162 | 149 |
message("The total size of files is big. We will download files in chunks") |
... | ... |
@@ -67,107 +67,52 @@ checkProjectInput <- function(project){ |
67 | 67 |
} |
68 | 68 |
} |
69 | 69 |
|
70 |
-checkLegacyPlatform <- function(project,data.category, legacy = FALSE){ |
|
71 |
- project.summary <- getProjectSummary(project, legacy) |
|
72 |
- if(missing(data.category)) { |
|
73 |
- print(knitr::kable(project.summary$data_categories)) |
|
74 |
- stop("Please set a data.category argument from the column data_category above") |
|
75 |
- } |
|
76 |
- if(!(data.category %in% project.summary$data_categories$data_category)) { |
|
77 |
- print(knitr::kable(project.summary$data_categories)) |
|
78 |
- stop("Please set a valid data.category argument from the column data_category above") |
|
79 |
- } |
|
80 |
-} |
|
70 |
+checkDataTypeInput <- function(data.type){ |
|
71 |
+ |
|
72 |
+ harmonized.data.type <- c( |
|
73 |
+ "Aggregated Somatic Mutation", |
|
74 |
+ "Aligned Reads", |
|
75 |
+ "Gene Expression Quantification", |
|
76 |
+ "Raw CGI Variant", |
|
77 |
+ "Methylation Beta Value", |
|
78 |
+ "Differential Gene Expression", |
|
79 |
+ "Splice Junction Quantification", |
|
80 |
+ "Protein Expression Quantification", |
|
81 |
+ "Annotated Somatic Mutation", |
|
82 |
+ "Raw Simple Somatic Mutation", |
|
83 |
+ "Masked Somatic Mutation", |
|
84 |
+ "Copy Number Segment", |
|
85 |
+ "Masked Intensities", |
|
86 |
+ "Allele-specific Copy Number Segment", |
|
87 |
+ "Masked Copy Number Segment", |
|
88 |
+ "Isoform Expression Quantification", |
|
89 |
+ "miRNA Expression Quantification", |
|
90 |
+ "Gene Level Copy Number", |
|
91 |
+ "Biospecimen Supplement", |
|
92 |
+ "Gene Level Copy Number Scores", |
|
93 |
+ "Protein Expression Quantification", |
|
94 |
+ "Clinical Supplement", |
|
95 |
+ "Single Cell Analysis", |
|
96 |
+ "Masked Somatic Mutation", |
|
97 |
+ "Slide Image" |
|
98 |
+ ) |
|
81 | 99 |
|
82 |
-checkDataTypeInput <- function(legacy, data.type){ |
|
83 |
- if(legacy){ |
|
84 |
- legacy.data.type <- c("Copy number segmentation", |
|
85 |
- "Raw intensities", |
|
86 |
- "Aligned reads", |
|
87 |
- "Copy number estimate", |
|
88 |
- "Simple nucleotide variation", |
|
89 |
- "Gene expression quantification", |
|
90 |
- "Coverage WIG", |
|
91 |
- "miRNA gene quantification", |
|
92 |
- "Genotypes", |
|
93 |
- "miRNA isoform quantification", |
|
94 |
- "Normalized copy numbers", |
|
95 |
- "Isoform expression quantification", |
|
96 |
- "Normalized intensities", |
|
97 |
- "Tissue slide image", |
|
98 |
- "Exon quantification", |
|
99 |
- "Exon junction quantification", |
|
100 |
- "Methylation beta value", |
|
101 |
- "Unaligned reads", |
|
102 |
- "Diagnostic image", |
|
103 |
- "CGH array QC", |
|
104 |
- "Biospecimen Supplement", |
|
105 |
- "Pathology report", |
|
106 |
- "Clinical Supplement", |
|
107 |
- "Intensities", |
|
108 |
- "Protein expression quantification", |
|
109 |
- "Microsatellite instability", |
|
110 |
- "Structural variation", |
|
111 |
- "Auxiliary test", |
|
112 |
- "Copy number QC metrics", |
|
113 |
- "Intensities Log2Ratio", |
|
114 |
- "Methylation array QC metrics", |
|
115 |
- "Clinical data", |
|
116 |
- "Copy number variation", |
|
117 |
- "ABI sequence trace", |
|
118 |
- "Protein Expression Quantification", |
|
119 |
- "Biospecimen data", |
|
120 |
- "Simple somatic mutation", |
|
121 |
- "Bisulfite sequence alignment", |
|
122 |
- "Methylation percentage", |
|
123 |
- "Sequencing tag", |
|
124 |
- "Sequencing tag counts", |
|
125 |
- "LOH") |
|
126 |
- if(!data.type %in% legacy.data.type) { |
|
127 |
- print(knitr::kable(as.data.frame(sort(legacy.data.type)))) |
|
128 |
- stop("Please set a data.type argument from the column legacy.data.type above") |
|
129 |
- } |
|
130 |
- } else { |
|
131 |
- harmonized.data.type <- c( |
|
132 |
- "Aggregated Somatic Mutation", |
|
133 |
- "Aligned Reads", |
|
134 |
- "Gene Expression Quantification", |
|
135 |
- "Raw CGI Variant", |
|
136 |
- "Methylation Beta Value", |
|
137 |
- "Differential Gene Expression", |
|
138 |
- "Splice Junction Quantification", |
|
139 |
- "Protein Expression Quantification", |
|
140 |
- "Annotated Somatic Mutation", |
|
141 |
- "Raw Simple Somatic Mutation", |
|
142 |
- "Masked Somatic Mutation", |
|
143 |
- "Copy Number Segment", |
|
144 |
- "Masked Intensities", |
|
145 |
- "Allele-specific Copy Number Segment", |
|
146 |
- "Masked Copy Number Segment", |
|
147 |
- "Isoform Expression Quantification", |
|
148 |
- "miRNA Expression Quantification", |
|
149 |
- "Gene Level Copy Number", |
|
150 |
- "Biospecimen Supplement", |
|
151 |
- "Gene Level Copy Number Scores", |
|
152 |
- "Protein Expression Quantification", |
|
153 |
- "Clinical Supplement", |
|
154 |
- "Single Cell Analysis", |
|
155 |
- "Masked Somatic Mutation", |
|
156 |
- "Slide Image") |
|
157 |
- if(!data.type %in% harmonized.data.type) { |
|
158 |
- print(knitr::kable(as.data.frame(sort(harmonized.data.type)))) |
|
159 |
- stop("Please set a data.type argument from the column harmonized.data.type above") |
|
160 |
- } |
|
100 |
+ if (!data.type %in% harmonized.data.type) { |
|
101 |
+ print(knitr::kable(as.data.frame(sort(harmonized.data.type)))) |
|
102 |
+ stop("Please set a data.type argument from the column harmonized.data.type above") |
|
161 | 103 |
} |
162 | 104 |
} |
163 | 105 |
|
164 |
-checkDataCategoriesInput <- function(project,data.category, legacy = FALSE){ |
|
106 |
+checkDataCategoriesInput <- function(project,data.category){ |
|
107 |
+ |
|
165 | 108 |
for(proj in project){ |
166 |
- project.summary <- getProjectSummary(proj, legacy) |
|
109 |
+ |
|
110 |
+ project.summary <- getProjectSummary(proj) |
|
167 | 111 |
if(missing(data.category)) { |
168 | 112 |
print(knitr::kable(project.summary$data_categories)) |
169 | 113 |
stop("Please set a data.category argument from the column data_category above") |
170 | 114 |
} |
115 |
+ |
|
171 | 116 |
if(!(data.category %in% project.summary$data_categories$data_category)) { |
172 | 117 |
print(knitr::kable(project.summary$data_categories)) |
173 | 118 |
stop("Please set a valid data.category argument from the column data_category above. We could not validade the data.category for project ", proj) |
... | ... |
@@ -618,13 +563,10 @@ get.mutation <- function( |
618 | 563 |
if(missing(genes)) stop("Argument genes is missing") |
619 | 564 |
|
620 | 565 |
# Get mutation annotation file |
621 |
- library(maftools) |
|
622 |
- library(dplyr) |
|
623 | 566 |
query <- GDCquery( |
624 | 567 |
project = project, |
625 | 568 |
data.category = "Simple Nucleotide Variation", |
626 | 569 |
access = "open", |
627 |
- legacy = FALSE, |
|
628 | 570 |
data.type = "Masked Somatic Mutation", |
629 | 571 |
workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" |
630 | 572 |
) |
... | ... |
@@ -638,8 +580,9 @@ get.mutation <- function( |
638 | 580 |
unlist( |
639 | 581 |
sapply( |
640 | 582 |
mutant_variant_classification, |
641 |
- function(x) grep(x,maf$Variant_Classification, |
|
642 |
- ignore.case = TRUE) |
|
583 |
+ function(x) { |
|
584 |
+ grep(x,maf$Variant_Classification,ignore.case = TRUE) |
|
585 |
+ } |
|
643 | 586 |
) |
644 | 587 |
) |
645 | 588 |
) |
... | ... |
@@ -648,8 +591,10 @@ get.mutation <- function( |
648 | 591 |
mut <- NULL |
649 | 592 |
for(i in genes) { |
650 | 593 |
if(!i %in% maf$Hugo_Symbol) next |
651 |
- aux <- data.frame(patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15), |
|
652 |
- mut = TRUE) |
|
594 |
+ aux <- data.frame( |
|
595 |
+ patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15), |
|
596 |
+ mut = TRUE |
|
597 |
+ ) |
|
653 | 598 |
colnames(aux)[2] <- paste0("mut_hg38_",i) |
654 | 599 |
if(is.null(mut)) { |
655 | 600 |
mut <- aux |
... | ... |
@@ -668,6 +613,7 @@ get.mutation <- function( |
668 | 613 |
|
669 | 614 |
return(mut) |
670 | 615 |
} |
616 |
+ |
|
671 | 617 |
get.mut.gistc <- function( |
672 | 618 |
project, |
673 | 619 |
genes, |
... | ... |
@@ -694,6 +640,7 @@ get.mut.gistc <- function( |
694 | 640 |
} else if(is.null(mut) & !is.null(cnv)) { |
695 | 641 |
return(cnv) |
696 | 642 |
} |
643 |
+ |
|
697 | 644 |
return(NULL) |
698 | 645 |
} |
699 | 646 |
get.mut.gistc.information <- function( |
... | ... |
@@ -91,7 +91,7 @@ GDCprepare <- function( |
91 | 91 |
stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost") |
92 | 92 |
} |
93 | 93 |
# We save the files in project/source/data.category/data.type/file_id/file_name |
94 |
- source <- ifelse(query$legacy,"legacy","harmonized") |
|
94 |
+ source <- "harmonized" |
|
95 | 95 |
files <- file.path( |
96 | 96 |
query$results[[1]]$project, source, |
97 | 97 |
gsub(" ","_",query$results[[1]]$data_category), |
... | ... |
@@ -174,8 +174,7 @@ GDCprepare <- function( |
174 | 174 |
files = files, |
175 | 175 |
cases = cases, |
176 | 176 |
summarizedExperiment = summarizedExperiment, |
177 |
- platform = unique(query$results[[1]]$platform), |
|
178 |
- legacy = query$legacy |
|
177 |
+ platform = unique(query$results[[1]]$platform) |
|
179 | 178 |
) |
180 | 179 |
} else if (grepl("Raw intensities|Masked Intensities",query$data.type, ignore.case = TRUE)) { |
181 | 180 |
# preparing IDAT files |
... | ... |
@@ -183,8 +182,7 @@ GDCprepare <- function( |
183 | 182 |
files = files, |
184 | 183 |
barcode = cases, |
185 | 184 |
summarizedExperiment = summarizedExperiment, |
186 |
- platform = unique(query$results[[1]]$platform), |
|
187 |
- legacy = query$legacy |
|
185 |
+ platform = unique(query$results[[1]]$platform) |
|
188 | 186 |
) |
189 | 187 |
} else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) { |
190 | 188 |
|
... | ... |
@@ -199,7 +197,7 @@ GDCprepare <- function( |
199 | 197 |
|
200 | 198 |
} else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) { |
201 | 199 |
|
202 |
- if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE) | source == "legacy"){ |
|
200 |
+ if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){ |
|
203 | 201 |
data <- readSimpleNucleotideVariationMaf(files) |
204 | 202 |
} |
205 | 203 |
|
... | ... |
@@ -212,7 +210,7 @@ GDCprepare <- function( |
212 | 210 |
files = files, |
213 | 211 |
cases = cases, |
214 | 212 |
summarizedExperiment = summarizedExperiment, |
215 |
- genome = ifelse(query$legacy,"hg19","hg38"), |
|
213 |
+ genome = "hg38", |
|
216 | 214 |
experimental.strategy = unique(query$results[[1]]$experimental_strategy) |
217 | 215 |
) |
218 | 216 |
|
... | ... |
@@ -221,7 +219,7 @@ GDCprepare <- function( |
221 | 219 |
files = files, |
222 | 220 |
cases = cases, |
223 | 221 |
summarizedExperiment = FALSE, |
224 |
- genome = ifelse(query$legacy,"hg19","hg38"), |
|
222 |
+ genome = "hg38", |
|
225 | 223 |
experimental.strategy = unique(query$results[[1]]$experimental_strategy) |
226 | 224 |
) |
227 | 225 |
|
... | ... |
@@ -713,14 +711,13 @@ readIDATDNAmethylation <- function( |
713 | 711 |
files, |
714 | 712 |
barcode, |
715 | 713 |
summarizedExperiment, |
716 |
- platform, |
|
717 |
- legacy |
|
714 |
+ platform |
|
718 | 715 |
) { |
719 | 716 |
|
720 | 717 |
check_package("sesame") |
721 | 718 |
|
722 | 719 |
# Check if moved files would be moved outside of scope folder, if so, path doesn't change |
723 |
- moved.files <- sapply(files,USE.NAMES=FALSE,function(x){ |
|
720 |
+ moved.files <- sapply(files,USE.NAMES = FALSE,function(x){ |
|
724 | 721 |
if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) { |
725 | 722 |
return(file.path(dirname(dirname(x)), basename(x))) |
726 | 723 |
} |
... | ... |
@@ -753,7 +750,7 @@ readIDATDNAmethylation <- function( |
753 | 750 |
|
754 | 751 |
betas <- makeSEFromDNAMethylationMatrix( |
755 | 752 |
betas = betas, |
756 |
- genome = ifelse(legacy,"hg19","hg38"), |
|
753 |
+ genome ="hg38", |
|
757 | 754 |
met.platform = platform |
758 | 755 |
) |
759 | 756 |
colData(betas) <- DataFrame(colDataPrepare(colnames(betas))) |
... | ... |
@@ -774,8 +771,7 @@ readDNAmethylation <- function( |
774 | 771 |
files, |
775 | 772 |
cases, |
776 | 773 |
summarizedExperiment = TRUE, |
777 |
- platform, |
|
778 |
- legacy |
|
774 |
+ platform |
|
779 | 775 |
){ |
780 | 776 |
if(length(platform) > 1){ |
781 | 777 |
|
... | ... |
@@ -847,7 +843,7 @@ readDNAmethylation <- function( |
847 | 843 |
|
848 | 844 |
df <- makeSEFromDNAMethylationMatrix( |
849 | 845 |
betas = df, |
850 |
- genome = ifelse(legacy,"hg19","hg38"), |
|
846 |
+ genome = "hg38", |
|
851 | 847 |
met.platform = platform |
852 | 848 |
) |
853 | 849 |
} |
... | ... |
@@ -1056,31 +1052,37 @@ colDataPrepareTCGA <- function(barcode){ |
1056 | 1052 |
# For the moment this will work only for TCGA Data |
1057 | 1053 |
# We should search what TARGET data means |
1058 | 1054 |
|
1059 |
- code <- c('01','02','03','04','05','06','07','08','09','10','11', |
|
1060 |
- '12','13','14','20','40','50','60','61') |
|
1061 |
- shortLetterCode <- c("TP","TR","TB","TRBM","TAP","TM","TAM","THOC", |
|
1062 |
- "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB", |
|
1063 |
- "CELL","XP","XCL") |
|
1064 |
- |
|
1065 |
- definition <- c("Primary solid Tumor", # 01 |
|
1066 |
- "Recurrent Solid Tumor", # 02 |
|
1067 |
- "Primary Blood Derived Cancer - Peripheral Blood", # 03 |
|
1068 |
- "Recurrent Blood Derived Cancer - Bone Marrow", # 04 |
|
1069 |
- "Additional - New Primary", # 05 |
|
1070 |
- "Metastatic", # 06 |
|
1071 |
- "Additional Metastatic", # 07 |
|
1072 |
- "Human Tumor Original Cells", # 08 |
|
1073 |
- "Primary Blood Derived Cancer - Bone Marrow", # 09 |
|
1074 |
- "Blood Derived Normal", # 10 |
|
1075 |
- "Solid Tissue Normal", # 11 |
|
1076 |
- "Buccal Cell Normal", # 12 |
|
1077 |
- "EBV Immortalized Normal", # 13 |
|
1078 |
- "Bone Marrow Normal", # 14 |
|
1079 |
- "Control Analyte", # 20 |
|
1080 |
- "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 |
|
1081 |
- "Cell Lines", # 50 |
|
1082 |
- "Primary Xenograft Tissue", # 60 |
|
1083 |
- "Cell Line Derived Xenograft Tissue") # 61 |
|
1055 |
+ code <- c( |
|
1056 |
+ '01','02','03','04','05','06','07','08','09','10','11', |
|
1057 |
+ '12','13','14','20','40','50','60','61' |
|
1058 |
+ ) |
|
1059 |
+ shortLetterCode <- c( |
|
1060 |
+ "TP","TR","TB","TRBM","TAP","TM","TAM","THOC", |
|
1061 |
+ "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB", |
|
1062 |
+ "CELL","XP","XCL" |
|
1063 |
+ ) |
|
1064 |
+ |
|
1065 |
+ definition <- c( |
|
1066 |
+ "Primary solid Tumor", # 01 |
|
1067 |
+ "Recurrent Solid Tumor", # 02 |
|
1068 |
+ "Primary Blood Derived Cancer - Peripheral Blood", # 03 |
|
1069 |
+ "Recurrent Blood Derived Cancer - Bone Marrow", # 04 |
|
1070 |
+ "Additional - New Primary", # 05 |
|
1071 |
+ "Metastatic", # 06 |
|
1072 |
+ "Additional Metastatic", # 07 |
|
1073 |
+ "Human Tumor Original Cells", # 08 |
|
1074 |
+ "Primary Blood Derived Cancer - Bone Marrow", # 09 |
|
1075 |
+ "Blood Derived Normal", # 10 |
|
1076 |
+ "Solid Tissue Normal", # 11 |
|
1077 |
+ "Buccal Cell Normal", # 12 |
|
1078 |
+ "EBV Immortalized Normal", # 13 |
|
1079 |
+ "Bone Marrow Normal", # 14 |
|
1080 |
+ "Control Analyte", # 20 |
|
1081 |
+ "Recurrent Blood Derived Cancer - Peripheral Blood", # 40 |
|
1082 |
+ "Cell Lines", # 50 |
|
1083 |
+ "Primary Xenograft Tissue", # 60 |
|
1084 |
+ "Cell Line Derived Xenograft Tissue" |
|
1085 |
+ ) # 61 |
|
1084 | 1086 |
aux <- DataFrame(code = code,shortLetterCode,definition) |
1085 | 1087 |
|
1086 | 1088 |
# in case of multiple identical barcodes |
... | ... |
@@ -1088,10 +1090,12 @@ colDataPrepareTCGA <- function(barcode){ |
1088 | 1090 |
"-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}") |
1089 | 1091 |
samples <- str_match(barcode,regex)[,1] |
1090 | 1092 |
|
1091 |
- ret <- DataFrame(barcode = barcode, |
|
1092 |
- patient = substr(barcode, 1, 12), |
|
1093 |
- sample = substr(barcode, 1, 16), |
|
1094 |
- code = substr(barcode, 14, 15)) |
|
1093 |
+ ret <- DataFrame( |
|
1094 |
+ barcode = barcode, |
|
1095 |
+ patient = substr(barcode, 1, 12), |
|
1096 |
+ sample = substr(barcode, 1, 16), |
|
1097 |
+ code = substr(barcode, 14, 15) |
|
1098 |
+ ) |
|
1095 | 1099 |
ret <- merge(ret,aux, by = "code", sort = FALSE) |
1096 | 1100 |
ret <- ret[match(barcode,ret$barcode),] |
1097 | 1101 |
rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE)) |
... | ... |
@@ -3,7 +3,6 @@ |
3 | 3 |
#' Uses the GDC API to search for data; it searches for both controlled and |
4 | 4 |
#' open-access data. |
5 | 5 |
#' For GDC data arguments project, data.category, data.type and workflow.type should be used |
6 |
-#' For the legacy data arguments project, data.category, platform and/or file.extension should be used. |
|
7 | 6 |
#' Please, see the vignette for a table with the possibilities. |
8 | 7 |
#' @param project A list of valid projects (see list with TCGAbiolinks:::getGDCprojects()$project_id) |
9 | 8 |
#' \itemize{ |
... | ... |
@@ -75,33 +74,15 @@ |
75 | 74 |
#' \item{ Simple Nucleotide Variation } |
76 | 75 |
#' \item{ Transcriptome Profiling } |
77 | 76 |
#' } |
78 |
-#' List for legacy archive |
|
79 |
-#' \itemize{ |
|
80 |
-#' \item{ Biospecimen } |
|
81 |
-#' \item{ Clinical } |
|
82 |
-#' \item{ Copy number variation } |
|
83 |
-#' \item{ DNA methylation } |
|
84 |
-#' \item{ Gene expression } |
|
85 |
-#' \item{ Protein expression } |
|
86 |
-#' \item{ Raw microarray data } |
|
87 |
-#' \item{ Raw sequencing data } |
|
88 |
-#' \item{ Simple nucleotide variation } |
|
89 |
-#' } |
|
90 | 77 |
#' @param data.type A data type to filter the files to download |
91 | 78 |
#' For the complete list please check the vignette. |
92 | 79 |
#' @param sample.type A sample type to filter the files to download |
93 | 80 |
#' @param barcode A list of barcodes to filter the files to download |
94 |
-#' @param legacy Search in the legacy repository |
|
95 | 81 |
#' @param data.format Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML", |
96 | 82 |
#' "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX") |
97 |
-#' @param file.type To be used in the legacy database for some platforms, |
|
98 |
-#' to define which file types to be used. |
|
99 | 83 |
#' @param workflow.type GDC workflow type |
100 |
-#' @param experimental.strategy Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. |
|
101 |
-#' Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, |
|
102 |
-#' DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, |
|
103 |
-#' MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, |
|
104 |
-#' Total RNA-Seq, Capillary sequencing, Bisulfite-Seq |
|
84 |
+#' @param experimental.strategy Filter to experimental strategy. |
|
85 |
+#' Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. |
|
105 | 86 |
#' @param access Filter by access type. Possible values: controlled, open |
106 | 87 |
#' @param platform Example: |
107 | 88 |
#' \tabular{ll}{ |
... | ... |
@@ -157,19 +138,6 @@ |
157 | 138 |
#' data.type = "Masked Copy Number Segment", |
158 | 139 |
#' sample.type = c("Primary Tumor") |
159 | 140 |
#' ) |
160 |
-#' query.met <- GDCquery( |
|
161 |
-#' project = c("TCGA-GBM","TCGA-LGG"), |
|
162 |
-#' legacy = TRUE, |
|
163 |
-#' data.category = "DNA methylation", |
|
164 |
-#' platform = "Illumina Human Methylation 450" |
|
165 |
-#' ) |
|
166 |
-#' query <- GDCquery( |
|
167 |
-#' project = "TCGA-ACC", |
|
168 |
-#' data.category = "Copy number variation", |
|
169 |
-#' legacy = TRUE, |
|
170 |
-#' file.type = "hg19.seg", |
|
171 |
-#' barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01") |
|
172 |
-#' ) |
|
173 | 141 |
#' } |
174 | 142 |
#' @return A data frame with the results and the parameters used |
175 | 143 |
#' @importFrom jsonlite fromJSON |
... | ... |
@@ -183,7 +151,6 @@ GDCquery <- function( |
183 | 151 |
data.category, |
184 | 152 |
data.type, |
185 | 153 |
workflow.type, |
186 |
- legacy = FALSE, |
|
187 | 154 |
access, |
188 | 155 |
platform, |
189 | 156 |
file.type, |
... | ... |
@@ -243,11 +210,11 @@ GDCquery <- function( |
243 | 210 |
} |
244 | 211 |
}) |
245 | 212 |
print.header("GDCquery: Searching in GDC database","section") |
246 |
- message("Genome of reference: ",ifelse(legacy,"hg19","hg38")) |
|
213 |
+ message("Genome of reference: hg38") |
|
247 | 214 |
# Check arguments |
248 | 215 |
checkProjectInput(project) |
249 |
- checkDataCategoriesInput(project, data.category, legacy) |
|
250 |
- if(!is.na(data.type)) checkDataTypeInput(legacy = legacy, data.type = data.type) |
|
216 |
+ checkDataCategoriesInput(project, data.category) |
|
217 |
+ if(!is.na(data.type)) checkDataTypeInput(data.type = data.type) |
|
251 | 218 |
if(!any(is.na(sample.type))) checkBarcodeDefinition(sample.type) |
252 | 219 |
|
253 | 220 |
results <- NULL |
... | ... |
@@ -257,7 +224,6 @@ GDCquery <- function( |
257 | 224 |
project = proj, |
258 | 225 |
data.category = data.category, |
259 | 226 |
data.type = data.type, |
260 |
- legacy = legacy, |
|
261 | 227 |
workflow.type = workflow.type, |
262 | 228 |
platform = platform, |
263 | 229 |
file.type = file.type, |
... | ... |
@@ -279,7 +245,6 @@ GDCquery <- function( |
279 | 245 |
project = proj, |
280 | 246 |
data.category = data.category, |
281 | 247 |
data.type = data.type, |
282 |
- legacy = legacy, |
|
283 | 248 |
workflow.type = NA, |
284 | 249 |
platform = NA, |
285 | 250 |
file.type = file.type, |
... | ... |
@@ -621,17 +586,6 @@ GDCquery <- function( |
621 | 586 |
message("ooo By sample.type") |
622 | 587 |
results <- results[tolower(results$sample_type) %in% tolower(sample.type),] |
623 | 588 |
} |
624 |
- # some how there are duplicated files in GDC we should remove them |
|
625 |
- # Example of problematic query |
|
626 |
- # query.exp <- GDCquery(project = "TCGA-BRCA", |
|
627 |
- # legacy = TRUE, |
|
628 |
- # data.category = "Gene expression", |
|
629 |
- # data.type = "Gene expression quantification", |
|
630 |
- # platform = "Illumina HiSeq", |
|
631 |
- # file.type = "results", |
|
632 |
- # experimental_strategy = "RNA-Seq", |
|
633 |
- # sample.type = c("Primary solid Tumor","Solid Tissue Normal")) |
|
634 |
- # |
|
635 | 589 |
print.header("Checking data","subsection") |
636 | 590 |
|
637 | 591 |
message("ooo Checking if there are duplicated cases") |
... | ... |
@@ -665,7 +619,6 @@ GDCquery <- function( |
665 | 619 |
project = I(list(project)), |
666 | 620 |
data.category = data.category, |
667 | 621 |
data.type = data.type, |
668 |
- legacy = legacy, |
|
669 | 622 |
access = I(list(access)), |
670 | 623 |
experimental.strategy = I(list(experimental.strategy)), |
671 | 624 |
file.type = file.type, |
... | ... |
@@ -677,37 +630,41 @@ GDCquery <- function( |
677 | 630 |
return(ret) |
678 | 631 |
} |
679 | 632 |
|
680 |
-getGDCquery <- function(project, data.category, data.type, legacy, workflow.type,platform,file.type,files.access,sample.type,experimental.strategy){ |
|
633 |
+getGDCquery <- function( |
|
634 |
+ project, |
|
635 |
+ data.category, |
|
636 |
+ data.type, |
|
637 |
+ workflow.type, |
|
638 |
+ platform, |
|
639 |
+ file.type, |
|
640 |
+ files.access, |
|
641 |
+ sample.type, |
|
642 |
+ experimental.strategy |
|
643 |
+){ |
|
681 | 644 |
# Get manifest using the API |
682 |
- baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/files/?","https://api.gdc.cancer.gov/files/?") |
|
645 |
+ baseURL <- "https://api.gdc.cancer.gov/files/?" |
|
683 | 646 |
options.pretty <- "pretty=true" |
684 |
- if(data.category == "Protein expression" & legacy) { |
|
685 |
- options.expand <- "fields=archive.revision,archive.file_name,md5sum,state,data_category,file_id,platform,file_name,file_size,md5sum,submitter_id,data_type&expand=cases.samples.portions,cases.project,center,analysis" |
|
686 |
- } else if(data.category %in% c("Clinical","Biospecimen")) { |
|
647 |
+ if(data.category %in% c("Clinical","Biospecimen")) { |
|
687 | 648 |
options.expand <- "expand=cases,cases.project,center,analysis" |
688 | 649 |
} else { |
689 | 650 |
options.expand <- "expand=cases,cases.samples.portions.analytes.aliquots,cases.project,center,analysis,cases.samples" |
690 | 651 |
} |
691 |
- option.size <- paste0("size=",getNbFiles(project,data.category,legacy)) |
|
652 |
+ option.size <- paste0("size=",getNbFiles(project,data.category)) |
|
692 | 653 |
option.format <- paste0("format=JSON") |
693 | 654 |
|
694 |
- options.filter <- paste0("filters=", |
|
695 |
- URLencode('{"op":"and","content":['), # Start json request |
|
696 |
- URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), |
|
697 |
- project, |
|
698 |
- URLencode('"]}}')) |
|
655 |
+ options.filter <- paste0( |
|
656 |
+ "filters=", |
|
657 |
+ URLencode('{"op":"and","content":['), # Start json request |
|
658 |
+ URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'), |
|
659 |
+ project, |
|
660 |
+ URLencode('"]}}') |
|
661 |
+ ) |
|
699 | 662 |
|
700 |
- if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy)) |
|
663 |
+ if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy)) |
|
701 | 664 |
if(!is.na(data.category)) options.filter <- paste0(options.filter,addFilter("files.data_category", data.category)) |
702 | 665 |
if(!is.na(data.type)) options.filter <- paste0(options.filter,addFilter("files.data_type", data.type)) |
703 | 666 |
if(!is.na(workflow.type)) options.filter <- paste0(options.filter,addFilter("files.analysis.workflow_type", workflow.type)) |
704 | 667 |
if(!any(is.na(platform))) options.filter <- paste0(options.filter,addFilter("files.platform", platform)) |
705 |
- if(!any(is.na(file.type))) { |
|
706 |
- if(file.type == "results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "unnormalized")) |
|
707 |
- if(file.type == "normalized_results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "normalized")) |
|
708 |
- if(file.type == "nocnv_hg19.seg" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "nocnv")) |
|
709 |
- if(file.type == "hg19.isoform" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "hg19")) |
|
710 |
- } |
|
711 | 668 |
if(!any(is.na(files.access))) { |
712 | 669 |
options.filter <- paste0(options.filter,addFilter("files.access", files.access)) |
713 | 670 |
} |
... | ... |
@@ -1028,12 +985,11 @@ GDCquery_ATAC_seq <- function( |
1028 | 985 |
results$data_category <- "ATAC-seq" |
1029 | 986 |
results$project <- "ATAC-seq" |
1030 | 987 |
ret <- data.frame( |
1031 |
- results=I(list(results)), |
|
988 |
+ results = I(list(results)), |
|
1032 | 989 |
tumor = I(list(tumor)), |
1033 | 990 |
project = I(list("ATAC-seq")), |
1034 | 991 |
data.type = I(list("ATAC-seq")), |
1035 |
- data.category = I(list("ATAC-seq")), |
|
1036 |
- legacy = I(list(FALSE)) |
|
992 |
+ data.category = I(list("ATAC-seq")) |
|
1037 | 993 |
) |
1038 | 994 |
|
1039 | 995 |
return(ret) |
... | ... |
@@ -871,7 +871,6 @@ unlistlabels <- function(lab) { |
871 | 871 |
#' @importFrom data.table dcast setDT setDF := |
872 | 872 |
#' @examples |
873 | 873 |
#' \dontrun{ |
874 |
-#' library(maftools) |
|
875 | 874 |
#' library(dplyr) |
876 | 875 |
#' query <- GDCquery( |
877 | 876 |
#' project = "TCGA-CHOL", |
... | ... |
@@ -929,7 +928,6 @@ TCGAvisualize_oncoprint <- function( |
929 | 928 |
annotation.legend.side = "bottom" |
930 | 929 |
){ |
931 | 930 |
|
932 |
- |
|
933 | 931 |
check_package("ComplexHeatmap") |
934 | 932 |
check_package("circlize") |
935 | 933 |
check_package("grid") |
... | ... |
@@ -34,15 +34,6 @@ Uses GDC API or GDC transfer tool to download gdc data |
34 | 34 |
The data from the query will be saved in a folder: project/data.category |
35 | 35 |
} |
36 | 36 |
\examples{ |
37 |
-query <- GDCquery( |
|
38 |
- project = "TCGA-ACC", |
|
39 |
- data.category = "Copy number variation", |
|
40 |
- legacy = TRUE, |
|
41 |
- file.type = "hg19.seg", |
|
42 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01") |
|
43 |
- ) |
|
44 |
-# data will be saved in GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation |
|
45 |
-GDCdownload(query, method = "api") |
|
46 | 37 |
\dontrun{ |
47 | 38 |
# Download clinical data from XML |
48 | 39 |
query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical") |
... | ... |
@@ -57,14 +48,14 @@ GDCdownload(query, method = "api") |
57 | 48 |
# data will be saved in: |
58 | 49 |
# example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification |
59 | 50 |
GDCdownload(query, method = "client", directory = "example_data_dir") |
60 |
- acc.gbm <- GDCquery( |
|
51 |
+ query_acc_gbm <- GDCquery( |
|
61 | 52 |
project = c("TCGA-ACC","TCGA-GBM"), |
62 | 53 |
data.category = "Transcriptome Profiling", |
63 | 54 |
data.type = "Gene Expression Quantification", |
64 | 55 |
workflow.type = "STAR - Counts" |
65 | 56 |
) |
66 | 57 |
GDCdownload( |
67 |
- query = acc.gbm, |
|
58 |
+ query = query_acc_gbm, |
|
68 | 59 |
method = "api", |
69 | 60 |
directory = "example", |
70 | 61 |
files.per.chunk = 50 |
... | ... |
@@ -9,7 +9,6 @@ GDCquery( |
9 | 9 |
data.category, |
10 | 10 |
data.type, |
11 | 11 |
workflow.type, |
12 |
- legacy = FALSE, |
|
13 | 12 |
access, |
14 | 13 |
platform, |
15 | 14 |
file.type, |
... | ... |
@@ -90,18 +89,6 @@ List for harmonized database: |
90 | 89 |
\item{ Sequencing Reads } |
91 | 90 |
\item{ Simple Nucleotide Variation } |
92 | 91 |
\item{ Transcriptome Profiling } |
93 |
-} |
|
94 |
-List for legacy archive |
|
95 |
-\itemize{ |
|
96 |
-\item{ Biospecimen } |
|
97 |
-\item{ Clinical } |
|
98 |
-\item{ Copy number variation } |
|
99 |
-\item{ DNA methylation } |
|
100 |
-\item{ Gene expression } |
|
101 |
-\item{ Protein expression } |
|
102 |
-\item{ Raw microarray data } |
|
103 |
-\item{ Raw sequencing data } |
|
104 |
-\item{ Simple nucleotide variation } |
|
105 | 92 |
}} |
106 | 93 |
|
107 | 94 |
\item{data.type}{A data type to filter the files to download |
... | ... |
@@ -109,8 +96,6 @@ For the complete list please check the vignette.} |
109 | 96 |
|
110 | 97 |
\item{workflow.type}{GDC workflow type} |
111 | 98 |
|
112 |
-\item{legacy}{Search in the legacy repository} |
|
113 |
- |
|
114 | 99 |
\item{access}{Filter by access type. Possible values: controlled, open} |
115 | 100 |
|
116 | 101 |
\item{platform}{Example: |
... | ... |
@@ -140,19 +125,13 @@ HumanMethylation27 \tab Mixed_DNASeq_Cont_curated \cr |
140 | 125 |
IlluminaHiSeq_RNASeqV2 \tab Mixed_DNASeq_Cont |
141 | 126 |
}} |
142 | 127 |
|
143 |
-\item{file.type}{To be used in the legacy database for some platforms, |
|
144 |
-to define which file types to be used.} |
|
145 |
- |
|
146 | 128 |
\item{barcode}{A list of barcodes to filter the files to download} |
147 | 129 |
|
148 | 130 |
\item{data.format}{Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML", |
149 | 131 |
"TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")} |
150 | 132 |
|
151 |
-\item{experimental.strategy}{Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. |
|
152 |
-Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, |
|
153 |
-DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, |
|
154 |
-MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, |
|
155 |
-Total RNA-Seq, Capillary sequencing, Bisulfite-Seq} |
|
133 |
+\item{experimental.strategy}{Filter to experimental strategy. |
|
134 |
+Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.} |
|
156 | 135 |
|
157 | 136 |
\item{sample.type}{A sample type to filter the files to download} |
158 | 137 |
} |
... | ... |
@@ -163,7 +142,6 @@ A data frame with the results and the parameters used |
163 | 142 |
Uses the GDC API to search for data; it searches for both controlled and |
164 | 143 |
open-access data. |
165 | 144 |
For GDC data arguments project, data.category, data.type and workflow.type should be used |
166 |
- For the legacy data arguments project, data.category, platform and/or file.extension should be used. |
|
167 | 145 |
Please, see the vignette for a table with the possibilities. |
168 | 146 |
} |
169 | 147 |
\examples{ |
... | ... |
@@ -193,19 +171,6 @@ query <- GDCquery( |
193 | 171 |
data.type = "Masked Copy Number Segment", |
194 | 172 |
sample.type = c("Primary Tumor") |
195 | 173 |
) |
196 |
-query.met <- GDCquery( |
|
197 |
- project = c("TCGA-GBM","TCGA-LGG"), |
|
198 |
- legacy = TRUE, |
|
199 |
- data.category = "DNA methylation", |
|
200 |
- platform = "Illumina Human Methylation 450" |
|
201 |
-) |
|
202 |
-query <- GDCquery( |
|
203 |
- project = "TCGA-ACC", |
|
204 |
- data.category = "Copy number variation", |
|
205 |
- legacy = TRUE, |
|
206 |
- file.type = "hg19.seg", |
|
207 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01") |
|
208 |
-) |
|
209 | 174 |
} |
210 | 175 |
} |
211 | 176 |
\author{ |
... | ... |
@@ -1,17 +1,16 @@ |
1 |
-context("Download AND PREPARE") |
|
2 |
- |
|
3 |
- |
|
1 |
+context("Download and prepare") |
|
4 | 2 |
|
5 | 3 |
test_that("GDCdownload API method is working ", { |
6 | 4 |
skip_on_bioc() |
7 | 5 |
skip_if_offline() |
8 | 6 |
|
9 |
- cases <- c( |
|
7 |
+ cases <- c( |
|
10 | 8 |
"TCGA-PA-A5YG-01A-11R-A29S-07", |
11 | 9 |
"TCGA-OR-A5JX-01A-11R-A29S-07", |
12 | 10 |
"TCGA-PK-A5HA-01A-11R-A29S-07", |
13 | 11 |
"TCGA-OR-A5KY-01A-11R-A29S-07" |
14 | 12 |
) |
13 |
+ |
|
15 | 14 |
acc <- GDCquery( |
16 | 15 |
project = c("TCGA-ACC"), |
17 | 16 |
data.category = "Transcriptome Profiling", |
... | ... |
@@ -20,8 +19,8 @@ test_that("GDCdownload API method is working ", { |
20 | 19 |
barcode = substr(cases,1,12) |
21 | 20 |
) |
22 | 21 |
GDCdownload(acc, method = "api", directory = "ex") |
23 |
- |
|
24 | 22 |
obj <- GDCprepare(acc, directory = "ex",summarizedExperiment = TRUE) |
23 |
+ |
|
25 | 24 |
expect_true(all(substr(colnames(obj),1,12) == substr(cases,1,12))) |
26 | 25 |
expect_true(all(obj$barcode == cases)) |
27 | 26 |
|
... | ... |
@@ -46,9 +45,6 @@ test_that("GDCdownload API method is working ", { |
46 | 45 |
expect_true(all(query$results[[1]]$sample.submitter_id == data$sample_submitter_id)) |
47 | 46 |
}) |
48 | 47 |
|
49 |
- |
|
50 |
- |
|
51 |
- |
|
52 | 48 |
test_that("getBarcodeInfo works", { |
53 | 49 |
skip_on_bioc() |
54 | 50 |
skip_if_offline() |
... | ... |
@@ -61,11 +57,14 @@ test_that("getBarcodeInfo works", { |
61 | 57 |
x <- getBarcodeInfo(c("TARGET-20-PARUDL-03A")) |
62 | 58 |
expect_true(all(cols %in% colnames(x))) |
63 | 59 |
|
64 |
- samples <- c("HCM-CSHL-0063-C18-85A", |
|
65 |
- "HCM-CSHL-0065-C20-06A", |
|
66 |
- "HCM-CSHL-0065-C20-85A", |
|
67 |
- "HCM-CSHL-0063-C18-01A") |
|
60 |
+ samples <- c( |
|
61 |
+ "HCM-CSHL-0063-C18-85A", |
|
62 |
+ "HCM-CSHL-0065-C20-06A", |
|
63 |
+ "HCM-CSHL-0065-C20-85A", |
|
64 |
+ "HCM-CSHL-0063-C18-01A" |
|
65 |
+ ) |
|
68 | 66 |
x <- colDataPrepare(samples) |
67 |
+ |
|
69 | 68 |
expect_true(all(rownames(x) == samples)) |
70 | 69 |
expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","gender"] == "male") |
71 | 70 |
expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","tumor_grade"] == "G2") |
... | ... |
@@ -102,22 +101,29 @@ test_that("colDataPrepare handle replicates", { |
102 | 101 |
test_that("GDCprepare accepts more than one project", { |
103 | 102 |
skip_on_bioc() |
104 | 103 |
skip_if_offline() |
105 |
- cases <- c("TCGA-OR-A5JX-01A", "TCGA-OR-A5J3-01A", |
|
106 |
- "TCGA-06-0680-11A","TCGA-14-0871-01A") |
|
104 |
+ cases <- c( |
|
105 |
+ "TCGA-OR-A5JX-01A", |
|
106 |
+ "TCGA-OR-A5J3-01A", |
|
107 |
+ "TCGA-06-0680-11A", |
|
108 |
+ "TCGA-14-0871-01A" |
|
109 |
+ ) |
|
107 | 110 |
expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% colDataPrepare(cases)$project_id)) |
108 |
- acc.gbm <- GDCquery(project = c("TCGA-ACC","TCGA-GBM"), |
|
109 |
- data.category = "Transcriptome Profiling", |
|
110 |
- data.type = "Gene Expression Quantification", |
|
111 |
- workflow.type = "STAR - Counts", |
|
112 |
- barcode = substr(cases,1,12)) |
|
113 |
- GDCdownload(acc.gbm, method = "api", directory = "ex") |
|
114 |
- obj <- GDCprepare(acc.gbm, directory = "ex") |
|
111 |
+ query_acc_gbm <- GDCquery( |
|
112 |
+ project = c("TCGA-ACC","TCGA-GBM"), |
|
113 |
+ data.category = "Transcriptome Profiling", |
|
114 |
+ data.type = "Gene Expression Quantification", |
|
115 |
+ workflow.type = "STAR - Counts", |
|
116 |
+ barcode = substr(cases, 1, 12) |
|
117 |
+ ) |
|
118 |
+ GDCdownload(query_acc_gbm, method = "api", directory = "ex") |
|
119 |
+ obj <- GDCprepare(query_acc_gbm, directory = "ex") |
|
115 | 120 |
expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% SummarizedExperiment::colData(obj)$project_id)) |
116 | 121 |
}) |
117 | 122 |
|
118 | 123 |
test_that("Non TCGA data is processed", { |
119 | 124 |
skip_on_bioc() |
120 | 125 |
skip_if_offline() |
126 |
+ |
|
121 | 127 |
proj <- "MMRF-COMMPASS" |
122 | 128 |
query <- GDCquery( |
123 | 129 |
project = proj, |
... | ... |
@@ -132,8 +138,6 @@ test_that("Non TCGA data is processed", { |
132 | 138 |
workflow.type = "STAR - Counts", |
133 | 139 |
barcode = getResults(query)$cases[1:4] |
134 | 140 |
) |
135 |
- #GDCdownload(query) |
|
136 |
- #data <- GDCprepare(query) |
|
137 | 141 |
}) |
138 | 142 |
|
139 | 143 |
test_that("Gene Level Copy Number is being correctly prepare", { |
... | ... |
@@ -151,7 +155,7 @@ test_that("Gene Level Copy Number is being correctly prepare", { |
151 | 155 |
data <- GDCprepare(query,directory = "ex") |
152 | 156 |
|
153 | 157 |
expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7"))) |
154 |
- unlink("ex",recursive = TRUE,force = TRUE) |
|
158 |
+ unlink("ex", recursive = TRUE, force = TRUE) |
|
155 | 159 |
}) |
156 | 160 |
|
157 | 161 |
test_that("DNAm files is processed correctly", { |
... | ... |
@@ -170,28 +174,6 @@ test_that("DNAm files is processed correctly", { |
170 | 174 |
expect_lt(abs(assay(data.hg38)["cg16739396","TCGA-E2-A158-01A-11D-A12E-05"] - 0.0688655418909783),10^-10) |
171 | 175 |
}) |
172 | 176 |
|
173 |
-test_that("IDAT files is processed", { |
|
174 |
- skip_on_bioc() |
|
175 |
- skip_if_offline() |
|
176 |
- |
|
177 |
- proj <- "TCGA-LUAD" |
|
178 |
- query <- GDCquery( |
|
179 |
- project = proj, |
|
180 |
- data.category = "Raw microarray data", |
|
181 |
- data.type = "Raw intensities", |
|
182 |
- experimental.strategy = "Methylation array", |
|
183 |
- legacy = TRUE, |
|
184 |
- file.type = ".idat", |
|
185 |
- barcode = "TCGA-55-7724", |
|
186 |
- platform = "Illumina Human Methylation 450" |
|
187 |
- ) |
|
188 |
- #tryCatch(GDCdownload(query, method = "api", files.per.chunk = 20), |
|
189 |
- # error = function(e) GDCdownload(query, method = "client")) |
|
190 |
- #betas <- GDCprepare(query) |
|
191 |
- #expect_true(nrow(betas) == 485577) |
|
192 |
- #expect_true(ncol(betas) == 1) |
|
193 |
-}) |
|
194 |
- |
|
195 | 177 |
test_that("Prepare samples without clinical data", { |
196 | 178 |
skip_on_bioc() |
197 | 179 |
skip_if_offline() |
... | ... |
@@ -214,30 +196,10 @@ test_that("Prepare multiple samples from the same patient", { |
214 | 196 |
expect_true("age_at_diagnosis" %in% colnames(x)) |
215 | 197 |
}) |
216 | 198 |
|
217 |
-test_that("Preparing HT_HG-U133A as SE works", { |
|
218 |
- skip_on_bioc() |
|
219 |
- skip_if_offline() |
|
220 |
- |
|
221 |
- query <- GDCquery( |
|
222 |
- project = "TCGA-GBM", |
|
223 |
- legacy = TRUE, |
|
224 |
- data.category = "Gene expression", |
|
225 |
- data.type = "Gene expression quantification", |
|
226 |
- platform = c("HT_HG-U133A") |
|
227 |
- ) |
|
228 |
- query$results[[1]] <- query$results[[1]][1:2,] |
|
229 |
- GDCdownload(query, method = "api", files.per.chunk = 100) |
|
230 |
- se <- GDCprepare(query, summarizedExperiment = TRUE) |
|
231 |
- |
|
232 |
- expect_true(is(se,"SummarizedExperiment")) |
|
233 |
-}) |
|
234 |
- |
|
235 |
- |
|
236 | 199 |
test_that("Preparing RRPA files with number of proteins works", { |
237 | 200 |
skip_on_bioc() |
238 | 201 |
skip_if_offline() |
239 | 202 |
|
240 |
- |
|
241 | 203 |
query_rppa <- GDCquery( |
242 | 204 |
project = c("TCGA-COAD"), |
243 | 205 |
data.category = "Proteome Profiling", |
... | ... |
@@ -249,9 +211,12 @@ test_that("Preparing RRPA files with number of proteins works", { |
249 | 211 |
|
250 | 212 |
GDCdownload(query_rppa) |
251 | 213 |
|
252 |
- expect_message(object = { |
|
253 |
- data_rppa <- GDCprepare(query_rppa) |
|
254 |
- },regexp = "Some files have a different number of proteins, we will introduce NA for the missing values") |
|
214 |
+ expect_message( |
|
215 |
+ object = { |
|
216 |
+ data_rppa <- GDCprepare(query_rppa) |
|
217 |
+ }, |
|
218 |
+ regexp = "Some files have a different number of proteins, we will introduce NA for the missing values" |
|
219 |
+ ) |
|
255 | 220 |
|
256 | 221 |
expect_true(is(data_rppa,"data.frame")) |
257 | 222 |
}) |
... | ... |
@@ -11,7 +11,7 @@ test_that("TCGAquery_SampleTypes returns the correct barcodes", { |
11 | 11 |
|
12 | 12 |
test_that("GDCquery_clinic populates correctly the data", { |
13 | 13 |
skip_on_bioc() |
14 |
- results <- GDCquery_clinic( "BEATAML1.0-COHORT") |
|
14 |
+ results <- GDCquery_clinic(project = "BEATAML1.0-COHORT") |
|
15 | 15 |
results.2028 <- results[results$submitter_id == "2028",] |
16 | 16 |
expect_equal(results.2028$vital_status,"Alive") |
17 | 17 |
expect_true( |
... | ... |
@@ -27,7 +27,7 @@ test_that("GDCquery_clinic populates correctly the data", { |
27 | 27 |
expect_equal(results.42$ethnicity,"not hispanic or latino") |
28 | 28 |
expect_equal(as.integer(results.2028$age_at_diagnosis %>% as.numeric() / 365.25),56) |
29 | 29 |
|
30 |
- results <- GDCquery_clinic( "TCGA-LUAD") |
|
30 |
+ results <- GDCquery_clinic(project = "TCGA-LUAD") |
|
31 | 31 |
results.sample <- results[results$submitter_id == "TCGA-80-5608",] |
32 | 32 |
expect_equal(results.sample$vital_status,"Alive") |
33 | 33 |
expect_equal(results.sample$gender,"female") |
... | ... |
@@ -20,16 +20,19 @@ test_that("GDCquery accepts more than one project", { |
20 | 20 |
data.category = "Copy Number Variation", |
21 | 21 |
data.type = "Copy Number Segment" |
22 | 22 |
) |
23 |
+ |
|
23 | 24 |
gbm <- GDCquery( |
24 | 25 |
project = "TCGA-GBM", |
25 | 26 |
data.category = "Copy Number Variation", |
26 | 27 |
data.type = "Copy Number Segment" |
27 | 28 |
) |
29 |
+ |
|
28 | 30 |
acc.gbm <- GDCquery( |
29 | 31 |
project = c("TCGA-ACC","TCGA-GBM"), |
30 | 32 |
data.category = "Copy Number Variation", |
31 | 33 |
data.type = "Copy Number Segment" |
32 | 34 |
) |
35 |
+ |
|
33 | 36 |
expect_equal(unique(acc.gbm$results[[1]]$data_type),"Copy Number Segment") |
34 | 37 |
expect_equal(nrow(acc.gbm$results[[1]]), sum(nrow(acc$results[[1]]),nrow(gbm$results[[1]]))) |
35 | 38 |
expect_true(nrow(dplyr::anti_join(acc$results[[1]],acc.gbm$results[[1]], by = "file_id")) == 0) |
... | ... |
@@ -51,34 +54,24 @@ test_that("GDCquery can filter by sample.type", { |
51 | 54 |
expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) |
52 | 55 |
|
53 | 56 |
sample.type <- "Solid Tissue Normal" |
54 |
- query <- GDCquery(project = "TCGA-ACC", |
|
55 |
- data.category = "Copy Number Variation", |
|
56 |
- data.type = "Masked Copy Number Segment", |
|
57 |
- sample.type = sample.type) |
|
58 |
- expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) |
|
59 |
- |
|
60 |
- sample.type <- "Solid Tissue Normal" |
|
61 |
- query <- GDCquery(project = c("TCGA-COAD"), |
|
62 |
- data.category = "Transcriptome Profiling", |
|
63 |
- data.type = "Gene Expression Quantification", |
|
64 |
- workflow.type = "STAR - Counts", |
|
65 |
- sample.type = sample.type) |
|
57 |
+ query <- GDCquery( |
|
58 |
+ project = "TCGA-ACC", |
|
59 |
+ data.category = "Copy Number Variation", |
|
60 |
+ data.type = "Masked Copy Number Segment", |
|
61 |
+ sample.type = sample.type |
|
62 |
+ ) |
|
66 | 63 |
expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) |
67 | 64 |
|
68 |
- |
|
69 | 65 |
sample.type <- "Solid Tissue Normal" |
70 |
- query <- GDCquery(project = "TCGA-BRCA", |
|
71 |
- legacy = TRUE, |
|
72 |
- data.category = "Gene expression", |
|
73 |
- data.type = "Gene expression quantification", |
|
74 |
- platform = "Illumina HiSeq", |
|
75 |
- file.type = "results", |
|
76 |
- experimental.strategy = "RNA-Seq", |
|
77 |
- sample.type = sample.type) |
|
66 |
+ query <- GDCquery( |
|
67 |
+ project = c("TCGA-COAD"), |
|
68 |
+ data.category = "Transcriptome Profiling", |
|
69 |
+ data.type = "Gene Expression Quantification", |
|
70 |
+ workflow.type = "STAR - Counts", |
|
71 |
+ sample.type = sample.type |
|
72 |
+ ) |
|
78 | 73 |
expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type) |
79 | 74 |
|
80 |
- |
|
81 |
- |
|
82 | 75 |
sample.type <- c("Solid Tissue Normal", "Primary Tumor") |
83 | 76 |
query <- GDCquery( |
84 | 77 |
project = "TCGA-ACC", |
... | ... |
@@ -121,56 +114,6 @@ test_that("GDCquery can filter by barcode", { |
121 | 114 |
expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases)) |
122 | 115 |
}) |
123 | 116 |
|
124 |
-test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg18", { |
|
125 |
- skip_on_bioc() |
|
126 |
- skip_if_offline() |
|
127 |
- |
|
128 |
- query <- GDCquery(project = "TCGA-ACC", |
|
129 |
- data.category = "Copy number variation", |
|
130 |
- legacy = TRUE, |
|
131 |
- file.type = "nocnv_hg18.seg", |
|
132 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) |
|
133 |
- expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg18.seg.txt") |
|
134 |
-}) |
|
135 |
- |
|
136 |
-test_that("GDCquery can filter copy number from legacy data by file type. Case: hg18", { |
|
137 |
- skip_on_bioc() |
|
138 |
- skip_if_offline() |
|
139 |
- |
|
140 |
- query <- GDCquery(project = "TCGA-ACC", |
|
141 |
- data.category = "Copy number variation", |
|
142 |
- legacy = TRUE, |
|
143 |
- file.type = "hg18.seg", |
|
144 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) |
|
145 |
- expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg18.seg.txt") |
|
146 |
-}) |
|
147 |
- |
|
148 |
-test_that("GDCquery can filter copy number from legacy data by file type. Case: hg19", { |
|
149 |
- skip_on_bioc() |
|
150 |
- skip_if_offline() |
|
151 |
- |
|
152 |
- query <- GDCquery(project = "TCGA-ACC", |
|
153 |
- data.category = "Copy number variation", |
|
154 |
- legacy = TRUE, |
|
155 |
- file.type = "hg19.seg", |
|
156 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) |
|
157 |
- expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg19.seg.txt") |
|
158 |
-}) |
|
159 |
- |
|
160 |
- |
|
161 |
-test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg19", { |
|
162 |
- skip_on_bioc() |
|
163 |
- skip_if_offline() |
|
164 |
- |
|
165 |
- query <- GDCquery(project = "TCGA-ACC", |
|
166 |
- data.category = "Copy number variation", |
|
167 |
- legacy = TRUE, |
|
168 |
- file.type = "nocnv_hg19.seg", |
|
169 |
- barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")) |
|
170 |
- expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg19.seg.txt") |
|
171 |
- |
|
172 |
-}) |
|
173 |
- |
|
174 | 117 |
|
175 | 118 |
test_that("GDCquery can filter by access level", { |
176 | 119 |
skip_on_bioc() |
... | ... |
@@ -186,15 +129,12 @@ test_that("GDCquery can filter by access level", { |
186 | 129 |
expect_equal(unique(query$results[[1]]$access),"controlled") |
187 | 130 |
}) |
188 | 131 |
|
189 |
- |
|
190 |
- |
|
191 |
- |
|
192 | 132 |
test_that("getNbFiles and getNbCases works", { |
193 | 133 |
skip_on_bioc() |
194 | 134 |
skip_if_offline() |
195 | 135 |
|
196 | 136 |
aux <- getProjectSummary("TCGA-LUAD",TRUE) |
197 |
- files <- getNbFiles("TCGA-LUAD","Raw microarray data",legacy = T) |
|
137 |
+ files <- getNbFiles("TCGA-LUAD","Raw microarray data") |
|
198 | 138 |
cases <- getNbCases("TCGA-LUAD","Raw microarray data") |
199 | 139 |
expect_true(cases < files) |
200 | 140 |
}) |
... | ... |
@@ -72,18 +72,8 @@ which defines the output type a Summarized Experiment (default option) or a data |
72 | 72 |
To create a summarized Experiment object we annotate the data with genomic positions |
73 | 73 |
with last patch release version of the genome available. |
74 | 74 |
|
75 |
-For legacy data (data aligned to hg19) TCGAbiolinks is using GRCh37.p13 and for |
|
76 |
-harmonized data (data aligned to hg38) now it is using Gencode version 36. |
|
77 | 75 |
|
78 |
-Unfortunately, some of the updates changes/remove gene symbols, change coordinates, etc. |
|
79 |
-Which might introduce some loss of data. For example, if the gene was removed we cannot map |
|
80 |
-it anymore and that information will be lost in the `SummarizedExperiment`. |
|
81 |
- |
|
82 |
-If you set `SummarizedExperiment` to `FALSE`, you will get the data unmodified |
|
83 |
-just as they are in the files and ad your own annotation. |
|
84 |
- |
|
85 |
-Also, there are no updated for DNA methylation data. But the last metadata available can be found |
|
86 |
-here: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation) |
|
76 |
+Also, the latest DNA methylation metadata is available at: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation) |
|
87 | 77 |
|
88 | 78 |
</div> |
89 | 79 |
</div> |
... | ... |
@@ -132,48 +122,6 @@ in `GDCprepare` and `GDCdownload` |
132 | 122 |
| mut.pipeline | If add.gistic2.mut is not NULL this field will be taken into consideration. Four separate variant calling pipelines are implemented for GDC data harmonization. Options: muse, varscan2, somaticsniper, MuTect2. For more information: https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/ | |
133 | 123 |
| mutant_variant_classification | List of mutant_variant_classification that will be consider a sample mutant or not. Default: "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation" | |
134 | 124 |
|
135 |
-## Search and download data from legacy database using GDC api method |
|
136 |
- |
|
137 |
-In this example we will download gene expression data from legacy database (data |
|
138 |
-aligned against genome of reference hg19) using GDC api method and we will show object data and metadata. |
|
139 |
-```{r results = 'hide', message=FALSE, warning=FALSE, eval = F} |
|
140 |
-query <- GDCquery( |
|
141 |
- project = "TCGA-GBM", |
|
142 |
- data.category = "Gene expression", |
|
143 |
- data.type = "Gene expression quantification", |
|
144 |
- platform = "Illumina HiSeq", |
|
145 |
- file.type = "normalized_results", |
|
146 |
- experimental.strategy = "RNA-Seq", |
|
147 |
- barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), |
|
148 |
- legacy = TRUE |
|
149 |
-) |
|
150 |
-GDCdownload( |
|
151 |
- query = query, |
|
152 |
- method = "api", |
|
153 |
- files.per.chunk = 10 |
|
154 |
-) |
|
155 |
-data <- GDCprepare(query = query) |
|
156 |
-``` |
|
157 |
- |
|
158 |
-```{r message=FALSE, warning=FALSE, include=FALSE} |
|
159 |
-data <- gbm.exp.legacy |
|
160 |
-``` |
|
161 |
- |
|
162 |
-```{r message=FALSE, warning=FALSE} |
|
163 |
-# Gene expression aligned against hg19. |
|
164 |
-datatable( |
|
165 |
- as.data.frame(colData(data)), |
|
166 |
- options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
|
167 |
- rownames = FALSE) |
|
168 |
-# Only first 20 rows to make render faster |
|
169 |
-datatable( |
|
170 |
- assay(data)[1:20,], |
|
171 |
- options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
|
172 |
- rownames = TRUE |
|
173 |
-) |
|
174 |
- |
|
175 |
-rowRanges(data) |
|
176 |
-``` |
|
177 | 125 |
|
178 | 126 |
|
179 | 127 |
## Search and download data for two samples from database |
... | ... |
@@ -238,44 +186,6 @@ Examples of query, download, prepare can be found in this [gist](https://gist.gi |
238 | 186 |
| Biospecimen | Biospecimen Supplement | | | |
239 | 187 |
| Clinical | | | | |
240 | 188 |
|
241 |
-## Legacy data |
|
242 |
-| Data.category | Data.type | Platform | file.type | Status | |
|
243 |
-|-----------------------------|-----------------------------------|-------------------------------------|--------------------|-----------------| |
|
244 |
-| Transcriptome Profiling | | | | | |
|
245 |
-| Copy number variation | - | Affymetrix SNP Array 6.0 | nocnv_hg18.seg | Working | |
|
246 |
-| | - | Affymetrix SNP Array 6.0 | hg18.seg | Working | |
|
247 |
-| | - | Affymetrix SNP Array 6.0 | nocnv_hg19.seg | Working | |
|
248 |
-| | - | Affymetrix SNP Array 6.0 | hg19.seg | Working | |
|
249 |
-| | - | Illumina HiSeq | Several | Working | |
|
250 |
-| Simple Nucleotide Variation | Simple somatic mutation | | | | |
|
251 |
-| Raw Sequencing Data | | | | | |
|
252 |
-| Biospecimen | | | | | |
|
253 |
-| Clinical | | | | | |
|
254 |
-| Protein expression | | MDA RPPA Core | - | Working | |
|
255 |
-| Gene expression | Gene expression quantification | Illumina HiSeq | normalized_results | Working | |
|
256 |
-| | | Illumina HiSeq | results | Working | |
|
257 |
-| | | HT_HG-U133A | - | Working | |
|
258 |
-| | | AgilentG4502A_07_2 | - | Data frame only | |
|
259 |
-| | | AgilentG4502A_07_1 | - | Data frame only | |
|
260 |
-| | | HuEx-1_0-st-v2 | FIRMA.txt | Not Preparing | |
|
261 |
-| | | | gene.txt | Not Preparing | |
|
262 |
-| | Isoform expression quantification | | | | |
|
263 |
-| | miRNA gene quantification | | | | |
|
264 |
-| | Exon junction quantification | | | | |
|
265 |
-| | Exon quantification | | | | |
|
266 |
-| | miRNA isoform quantification | | | | |
|
267 |
-| | | | | | |
|
268 |
-| DNA methylation | | Illumina Human Methylation 450 | Not used | Working | |
|
269 |
-| | | Illumina Human Methylation 27 | Not used | Working | |
|
270 |
-| | | Illumina DNA Methylation OMA003 CPI | Not used | Working | |
|
271 |
-| | | Illumina DNA Methylation OMA002 CPI | Not used | Working | |
|
272 |
-| | | Illumina Hi Seq | | Not working | |
|
273 |
-| Raw Microarray Data | | | | | |
|
274 |
-| Structural Rearrangement | | | | | |
|
275 |
-| Other | | | | | |
|
276 |
- |
|
277 |
- |
|
278 |
- |
|
279 | 189 |
# Examples |
280 | 190 |
|
281 | 191 |
|
... | ... |
@@ -444,8 +354,7 @@ query <- GDCquery( |
444 | 354 |
project = "TCGA-BRCA", |
445 | 355 |
data.category = "DNA Methylation", |
446 | 356 |
data.type = "Masked Intensities", |
447 |
- platform = "Illumina Human Methylation 27", |
|
448 |
- legacy = FALSE |
|
357 |
+ platform = "Illumina Human Methylation 27" |
|
449 | 358 |
) |
450 | 359 |
GDCdownload(query, files.per.chunk=10) |
451 | 360 |
betas <- GDCprepare(query) |
... | ... |
@@ -454,10 +363,9 @@ query <- GDCquery( |
454 | 363 |
project = "HCMI-CMDC", |
455 | 364 |
data.category = "DNA Methylation", |
456 | 365 |
data.type = "Masked Intensities", |
457 |
- platform = "Illumina Methylation Epic", |
|
458 |
- legacy = FALSE |
|
366 |
+ platform = "Illumina Methylation Epic" |
|
459 | 367 |
) |
460 |
-GDCdownload(query, files.per.chunk=10) |
|
368 |
+GDCdownload(query, files.per.chunk = 10) |
|
461 | 369 |
betas <- GDCprepare(query) |
462 | 370 |
|
463 | 371 |
|
... | ... |
@@ -465,8 +373,7 @@ query <- GDCquery( |
465 | 373 |
project = "CPTAC-3", |
466 | 374 |
data.category = "DNA Methylation", |
467 | 375 |
data.type = "Masked Intensities", |
468 |
- platform = "Illumina Methylation Epic", |
|
469 |
- legacy = FALSE |
|
376 |
+ platform = "Illumina Methylation Epic" |
|
470 | 377 |
) |
471 | 378 |
GDCdownload(query, files.per.chunk=10) |
472 | 379 |
betas <- GDCprepare(query) |
... | ... |
@@ -475,10 +382,9 @@ query <- GDCquery( |
475 | 382 |
project = "TCGA-BRCA", |
476 | 383 |
data.category = "DNA Methylation", |
477 | 384 |
data.type = "Masked Intensities", |
478 |
- platform = "Illumina Methylation Epic", |
|
479 |
- legacy = FALSE |
|
385 |
+ platform = "Illumina Methylation Epic" |
|
480 | 386 |
) |
481 |
-GDCdownload(query, files.per.chunk=10) |
|
387 |
+GDCdownload(query, files.per.chunk = 10) |
|
482 | 388 |
betas <- GDCprepare(query) |
483 | 389 |
|
484 | 390 |
|
... | ... |
@@ -571,7 +477,6 @@ https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeli |
571 | 477 |
query.sc.analysis <- GDCquery( |
572 | 478 |
project = "CPTAC-3", |
573 | 479 |
data.category = "Transcriptome Profiling", |
574 |
- legacy = FALSE, |
|
575 | 480 |
access = "open", |
576 | 481 |
data.type = "Single Cell Analysis", |
577 | 482 |
data.format = "TSV" |
... | ... |
@@ -584,7 +489,6 @@ Single.Cell.Analysis.list <- GDCprepare(query.sc.analysis) |
584 | 489 |
query.hdF5 <- GDCquery( |
585 | 490 |
project = "CPTAC-3", |
586 | 491 |
data.category = "Transcriptome Profiling", |
587 |
- legacy = FALSE, |
|
588 | 492 |
access = "open", |
589 | 493 |
data.type = "Single Cell Analysis", |
590 | 494 |
barcode = c("CPT0167860015","CPT0206880004"), |
... | ... |
@@ -598,7 +502,6 @@ df.HDF5 <- GDCprepare(query.hdF5) |
598 | 502 |
query.raw.counts <- GDCquery( |
599 | 503 |
project = "CPTAC-3", |
600 | 504 |
data.category = "Transcriptome Profiling", |
601 |
- legacy = FALSE, |
|
602 | 505 |
access = "open", |
603 | 506 |
data.type = "Gene Expression Quantification", |
604 | 507 |
barcode = c("CPT0167860015","CPT0206880004"), |
... | ... |
@@ -612,7 +515,6 @@ raw.counts.list <- GDCprepare(query.raw.counts) |
612 | 515 |
query.filtered.counts <- GDCquery( |
613 | 516 |
project = "CPTAC-3", |
614 | 517 |
data.category = "Transcriptome Profiling", |
615 |
- legacy = FALSE, |
|
616 | 518 |
access = "open", |
617 | 519 |
data.type = "Gene Expression Quantification", |
618 | 520 |
barcode = c("CPT0167860015","CPT0206880004"), |
... | ... |
@@ -627,7 +529,6 @@ filtered.counts.list <- GDCprepare(query.filtered.counts) |
627 | 529 |
query.sc.dea <- GDCquery( |
628 | 530 |
project = "CPTAC-3", |
629 | 531 |
data.category = "Transcriptome Profiling", |
630 |
- legacy = FALSE, |
|
631 | 532 |
access = "open", |
632 | 533 |
data.type = "Differential Gene Expression", |
633 | 534 |
barcode = c("CPT0167860015","CPT0206880004"), |
... | ... |
@@ -636,91 +537,3 @@ query.sc.dea <- GDCquery( |
636 | 537 |
GDCdownload(query.sc.dea) |
637 | 538 |
sc.dea.list <- GDCprepare(query.sc.dea) |
638 | 539 |
``` |
639 |
- |
|
640 |
-## Legacy archive: data aligned against hg19 |
|
641 |
- |
|
642 |
-### DNA methylation: Get all TCGA IDAT files |
|
643 |
- |
|
644 |
-```{r message=FALSE, warning=FALSE, eval =FALSE} |
|
645 |
-#------------------------------------------------------- |
|
646 |
-# Example to idat files from TCGA projects |
|
647 |
-#------------------------------------------------------- |
|
648 |
-projects <- TCGAbiolinks:::getGDCprojects()$project_id |
|
649 |
-projects <- projects[grepl('^TCGA',projects,perl=T)] |
|
650 |
-match.file.cases.all <- NULL |
|
651 |
-for(proj in projects){ |
|
652 |
- print(proj) |
|
653 |
- query <- GDCquery( |
|
654 |
- project = proj, |
|
655 |
- data.category = "Raw microarray data", |
|
656 |
- data.type = "Raw intensities", |
|
657 |
- experimental.strategy = "Methylation array", |
|
658 |
- legacy = TRUE, |
|
659 |
- file.type = ".idat", |
|
660 |
- platform = "Illumina Human Methylation 450" |
|
661 |
- ) |
|
662 |
- match.file.cases <- getResults(query,cols=c("cases","file_name")) |
|
663 |
- match.file.cases$project <- proj |
|
664 |
- match.file.cases.all <- rbind(match.file.cases.all,match.file.cases) |
|
665 |
- tryCatch( |
|
666 |
- GDCdownload(query, method = "api", files.per.chunk = 20), |
|
667 |
- error = function(e) GDCdownload(query, method = "client") |
|
668 |
- ) |
|
669 |
-} |
|
670 |
-# This will create a map between idat file name, cases (barcode) and project |
|
671 |
-readr::write_tsv(match.file.cases.all, path = "idat_filename_case.txt") |
|
672 |
-# code to move all files to local folder |
|
673 |
-for(file in dir(".",pattern = ".idat", recursive = T)){ |
|
674 |
- TCGAbiolinks::move(file,basename(file)) |
|
675 |
-} |
|
676 |
-``` |
|
677 |
- |
|
678 |
- |
|
679 |
-### DNA methylation |
|
680 |
- |
|
681 |
-```{r, eval = FALSE} |
|
682 |
-query_meth.hg19 <- GDCquery( |
|
683 |
- project= "TCGA-LGG", |
|
684 |
- data.category = "DNA methylation", |
|
685 |
- platform = "Illumina Human Methylation 450", |
|
686 |
- barcode = c("TCGA-HT-8111-01A-11D-2399-05","TCGA-HT-A5R5-01A-11D-A28N-05"), |
|
687 |
- legacy = TRUE |
|
688 |
-) |
|
689 |
-GDCdownload(query_meth.hg19) |
|
690 |
-data.hg19 <- GDCprepare(query_meth.hg19) |
|
691 |
-``` |
|
692 |
- |
|
693 |
- |
|
694 |
-### Protein expression |
|
695 |
-```{r, eval = FALSE} |
|
696 |
-query <- GDCquery( |
|
697 |
- project = "TCGA-GBM", |
|
698 |
- data.category = "Protein expression", |
|
699 |
- legacy = TRUE, |
|
700 |
- barcode = c("TCGA-OX-A56R-01A-21-A44T-20","TCGA-08-0357-01A-21-1898-20") |
|
701 |
-) |
|
702 |
-GDCdownload(query) |
|
703 |
-data <- GDCprepare( |
|
704 |
- query, save = TRUE, |
|
705 |
- save.filename = "gbmProteinExpression.rda", |
|
706 |
- remove.files.prepared = TRUE |
|
707 |
-) |
|
708 |
-``` |
|
709 |
- |
|
710 |
- |
|
711 |
-### Gene expression |
|
712 |
-```{r, eval = FALSE} |
|
713 |
-# Aligned against Hg19 |
|
714 |
-query.exp.hg19 <- GDCquery( |
|
715 |
- project = "TCGA-GBM", |
|
716 |
- data.category = "Gene expression", |
|
717 |
- data.type = "Gene expression quantification", |
|
718 |
- platform = "Illumina HiSeq", |
|
719 |
- file.type = "normalized_results", |
|
720 |
- experimental.strategy = "RNA-Seq", |
|
721 |
- barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), |
|
722 |
- legacy = TRUE |
|
723 |
-) |
|
724 |
-GDCdownload(query.exp.hg19) |
|
725 |
-data <- GDCprepare(query.exp.hg19) |
|
726 |
-``` |
... | ... |
@@ -18,8 +18,6 @@ knitr::opts_knit$set(progress = FALSE) |
18 | 18 |
|
19 | 19 |
|
20 | 20 |
**TCGAbiolinks** has provided a few functions to search the GDC database. |
21 |
-This section starts by explaining the different GDC sources (Harmonized and Legacy Archive), followed by some examples |
|
22 |
-how to access them. |
|
23 | 21 |
|
24 | 22 |
|
25 | 23 |
--- |
... | ... |
@@ -33,23 +31,6 @@ library(DT) |
33 | 31 |
|
34 | 32 |
# Useful information |
35 | 33 |
|
36 |
-<div class="panel panel-info"> |
|
37 |
-<div class="panel-heading">Different sources: Legacy vs Harmonized</div> |
|
38 |
-<div class="panel-body"> |
|
39 |
- |
|
40 |
- |
|
41 |
-There are two available sources to download GDC data using TCGAbiolinks: |
|
42 |
- |
|
43 |
-- GDC Legacy Archive : provides access to an unmodified copy of data that was previously stored in |
|
44 |
-[CGHub](https://cghub.ucsc.edu/) and in the TCGA Data Portal hosted by the TCGA Data Coordinating Center (DCC), in which uses |
|
45 |
-as references GRCh37 (hg19) and GRCh36 (hg18). |
|
46 |
-- GDC harmonized database: data available was harmonized against GRCh38 (hg38) using GDC Bioinformatics Pipelines |
|
47 |
-which provides methods to the standardization of biospecimen and |
|
48 |
-clinical data. |
|
49 |
- |
|
50 |
-</div> |
|
51 |
-</div> |
|
52 |
- |
|
53 | 34 |
|
54 | 35 |
<div class="panel panel-info"> |
55 | 36 |
<div class="panel-heading">Understanding the barcode</div> |
... | ... |
@@ -79,7 +60,6 @@ with the following arguments: |
79 | 60 |
| data.category | A valid data category for the project (see list with TCGAbiolinks:::getProjectSummary(project)) | |
80 | 61 |
| data.type | A data type to filter the files to download | | |
81 | 62 |
| workflow.type | GDC workflow type | | |
82 |
-| legacy | Search in the legacy repository | | |
|
83 | 63 |
| access | Filter by access type. Possible values: controlled, open | | |
84 | 64 |
| platform | Example: | | |
85 | 65 |
| | CGH- 1x1M_G4447A | IlluminaGA_RNASeqV2 | |
... | ... |
@@ -107,7 +87,7 @@ with the following arguments: |
107 | 87 |
| | IlluminaHiSeq_RNASeqV2 | Mixed_DNASeq_Cont | |
108 | 88 |
| file.type | To be used in the legacy database for some platforms, to define which file types to be used. | | |
109 | 89 |
| barcode | A list of barcodes to filter the files to download | | |
110 |
-| experimental.strategy | Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, Total RNA-Seq, Capillary sequencing, Bisulfite-Seq | | |
|
90 |
+| experimental.strategy | Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. | | |
|
111 | 91 |
| sample.type | A sample type to filter the files to download | | |
112 | 92 |
|
113 | 93 |
|
... | ... |
@@ -138,7 +118,7 @@ datatable( |
138 | 118 |
The other fields (data.category, data.type, workflow.type, platform, file.type) can be found below. |
139 | 119 |
Please note that these tables are still incomplete. |
140 | 120 |
|
141 |
-## Harmonized data options (`legacy = FALSE`) |
|
121 |
+## Harmonized data options |
|
142 | 122 |
|
143 | 123 |
```{r, echo=FALSE} |
144 | 124 |
datatable( |
... | ... |
@@ -149,21 +129,12 @@ datatable( |
149 | 129 |
) |
150 | 130 |
``` |
151 | 131 |
|
152 |
-## Legacy archive data options (`legacy = TRUE`) |
|
153 |
-```{r, echo=FALSE} |
|
154 |
-datatable( |
|
155 |
- readr::read_csv("https://docs.google.com/spreadsheets/d/1f98kFdj9mxVDc1dv4xTZdx8iWgUiDYO-qiFJINvmTZs/export?format=csv&gid=1817673686",col_types = readr::cols()), |
|
156 |
- filter = 'top', |
|
157 |
- options = list(scrollX = TRUE, keys = TRUE, pageLength = 40), |
|
158 |
- rownames = FALSE |
|
159 |
-) |
|
160 |
-``` |
|
161 | 132 |
|
162 | 133 |
# Harmonized database examples |
163 | 134 |
|
164 | 135 |
## DNA methylation data: Recurrent tumor samples |
165 | 136 |
|
166 |
-In this example we will access the harmonized database (`legacy = FALSE`) |
|
137 |
+In this example we will access the harmonized database |
|
167 | 138 |
and search for all DNA methylation data for recurrent glioblastoma multiforme (GBM) |
168 | 139 |
and low grade gliomas (LGG) samples. |
169 | 140 |
|
... | ... |
@@ -172,7 +143,6 @@ and low grade gliomas (LGG) samples. |
172 | 143 |
query <- GDCquery( |
173 | 144 |
project = c("TCGA-GBM", "TCGA-LGG"), |
174 | 145 |
data.category = "DNA Methylation", |
175 |
- legacy = FALSE, |
|
176 | 146 |
platform = c("Illumina Human Methylation 450"), |
177 | 147 |
sample.type = "Recurrent Tumor" |
178 | 148 |
) |
... | ... |
@@ -186,19 +156,18 @@ datatable( |
186 | 156 |
|
187 | 157 |
## Samples with DNA methylation and gene expression data |
188 | 158 |
|
189 |
-In this example we will access the harmonized database (`legacy = FALSE`) |
|
159 |
+In this example we will access the harmonized database |
|
190 | 160 |
and search for all patients with DNA methylation (platform HumanMethylation450k) and gene expression data |
191 | 161 |
for Colon Adenocarcinoma tumor (TCGA-COAD). |
192 | 162 |
|
193 | 163 |
|
194 | 164 |
```{r message=FALSE, warning = FALSE, eval = FALSE} |
195 |
-query.met <- GDCquery( |
|
165 |
+query_met <- GDCquery( |
|
196 | 166 |
project = "TCGA-COAD", |
197 | 167 |
data.category = "DNA Methylation", |
198 |
- legacy = FALSE, |
|
199 | 168 |
platform = c("Illumina Human Methylation 450") |
200 | 169 |
) |
201 |
-query.exp <- GDCquery( |
|
170 |
+query_exp <- GDCquery( |
|
202 | 171 |
project = "TCGA-COAD", |
203 | 172 |
data.category = "Transcriptome Profiling", |
204 | 173 |
data.type = "Gene Expression Quantification", |
... | ... |
@@ -207,20 +176,19 @@ query.exp <- GDCquery( |
207 | 176 |
|
208 | 177 |
# Get all patients that have DNA methylation and gene expression. |
209 | 178 |
common.patients <- intersect( |
210 |
- substr(getResults(query.met, cols = "cases"), 1, 12), |
|
211 |
- substr(getResults(query.exp, cols = "cases"), 1, 12) |
|
179 |
+ substr(getResults(query_met, cols = "cases"), 1, 12), |
|
180 |
+ substr(getResults(query_exp, cols = "cases"), 1, 12) |
|
212 | 181 |
) |
213 | 182 |
|
214 | 183 |
# Only select the first 5 patients |
215 |
-query.met <- GDCquery( |
|
184 |
+query_met <- GDCquery( |
|
216 | 185 |
project = "TCGA-COAD", |
217 | 186 |
data.category = "DNA Methylation", |
218 |
- legacy = FALSE, |
|
219 | 187 |
platform = c("Illumina Human Methylation 450"), |
220 | 188 |
barcode = common.patients[1:5] |
221 | 189 |
) |
222 | 190 |
|
223 |
-query.exp <- GDCquery( |
|
191 |
+query_exp <- GDCquery( |
|
224 | 192 |
project = "TCGA-COAD", |
225 | 193 |
data.category = "Transcriptome Profiling", |
226 | 194 |
data.type = "Gene Expression Quantification", |
... | ... |
@@ -231,13 +199,13 @@ query.exp <- GDCquery( |
231 | 199 |
|
232 | 200 |
```{r results_matched, message=FALSE, warning=FALSE, eval = FALSE} |
233 | 201 |
datatable( |
234 |
- getResults(query.met, cols = c("data_type","cases")), |
|
202 |
+ getResults(query_met, cols = c("data_type","cases")), |
|
235 | 203 |
filter = 'top', |
236 | 204 |
options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
237 | 205 |
rownames = FALSE |
238 | 206 |
) |
239 | 207 |
datatable( |
240 |
- getResults(query.exp, cols = c("data_type","cases")), |
|
208 |
+ getResults(query_exp, cols = c("data_type","cases")), |
|
241 | 209 |
filter = 'top', |
242 | 210 |
options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
243 | 211 |
rownames = FALSE |
... | ... |
@@ -327,98 +295,13 @@ datatable( |
327 | 295 |
``` |
328 | 296 |
|
329 | 297 |
|
330 |
-# Legacy archive examples |
|
331 |
- |
|
332 |
-## DNA methylation |
|
333 |
- |
|
334 |
-### Array-based assays |
|
335 |
- |
|
336 |
-This example shows how the user can search for glioblastoma multiform (GBM) |
|
337 |
-and DNA methylation data |
|
338 |
-for platform Illumina Human Methylation 450 and Illumina Human Methylation 27. |
|
339 |
- |
|
340 |
-```{r message=FALSE, warning=FALSE} |
|
341 |
-query <- GDCquery( |
|
342 |
- project = c("TCGA-GBM"), |
|
343 |
- legacy = TRUE, |
|
344 |
- data.category = "DNA methylation", |
|
345 |
- platform = c("Illumina Human Methylation 450", "Illumina Human Methylation 27") |
|
346 |
-) |
|
347 |
-datatable( |
|
348 |
- getResults(query, rows = 1:100), |
|
349 |
- filter = 'top', |
|
350 |
- options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
|
351 |
- rownames = FALSE |
|
352 |
-) |
|
353 |
-``` |
|
354 |
- |
|
355 |
-### whole-genome bisulfite sequencing (WGBS) |
|
356 |
- |
|
357 |
-```{r message = FALSE, warning = FALSE, eval = FALSE} |
|
358 |
- |
|
359 |
-query <- GDCquery( |
|
360 |
- project = c("TCGA-LUAD"), |
|
361 |
- legacy = TRUE, |
|
362 |
- data.category = "DNA methylation", |
|
363 |
- data.type = "Methylation percentage", |
|
364 |
- experimental.strategy = "Bisulfite-Seq" |
|
365 |
-) |
|
366 |
- |
|
367 |
-# VCF - controlled data |
|
368 |
-query <- GDCquery( |
|
369 |
- project = c("TCGA-LUAD"), |
|
370 |
- legacy = TRUE, |
|
371 |
- data.category = "DNA methylation", |
|
372 |
- data.type = "Bisulfite sequence alignment", |
|
373 |
- experimental.strategy = "Bisulfite-Seq" |
|
374 |
-) |
|
375 |
- |
|
376 |
- |
|
377 |
-# WGBS BAM files - controlled data |
|
378 |
-query <- GDCquery( |
|
379 |
- project = c("TCGA-LUAD"), |
|
380 |
- legacy = TRUE, |
|
381 |
- data.type = "Aligned reads", |
|
382 |
- data.category = "Raw sequencing data", |
|
383 |
- experimental.strategy = "Bisulfite-Seq" |
|
384 |
-) |
|
385 |
-``` |
|
386 |
- |
|
387 |
- |
|
388 |
-## Gene expression |
|
389 |
- |
|
390 |
-This exmaple shows how the user can search for glioblastoma multiform (GBM) |
|
391 |
-gene expression data with the normalized results for expression of a gene. |
|
392 |
-For more information about file.types check [GDC TCGA file types](https://gdc.cancer.gov/resources-tcga-users/legacy-archive-tcga-tag-descriptions) |
|
393 |
- |
|
394 |
-```{r message=FALSE, warning=FALSE} |
|
395 |
-# Gene expression aligned against hg19. |
|
396 |
-query.exp.hg19 <- GDCquery( |
|
397 |
- project = "TCGA-GBM", |
|
398 |
- data.category = "Gene expression", |
|
399 |
- data.type = "Gene expression quantification", |
|
400 |
- platform = "Illumina HiSeq", |
|
401 |
- file.type = "normalized_results", |
|
402 |
- experimental.strategy = "RNA-Seq", |
|
403 |
- barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"), |
|
404 |
- legacy = TRUE |
|
405 |
-) |
|
406 |
- |
|
407 |
-datatable( |
|
408 |
- getResults(query.exp.hg19), |
|
409 |
- filter = 'top', |
|
410 |
- options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), |
|
411 |
- rownames = FALSE |
|
412 |
-) |
|
413 |
-``` |
|
414 |
- |
|
415 | 298 |
# Get Manifest file |
416 | 299 |
|
417 | 300 |
If you want to get the manifest file from the query object you can use the function *getManifest*. If you |
418 |
-set save to TRUEm a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [ddt-ui](https://github.com/NCI-GDC/dtt-ui) will be created. |
|
301 |
+set save to `TRUE`, a txt file that can be used with the GDC-client Data transfer tool (DTT) or with its GUI version [dtt-ui](https://github.com/NCI-GDC/dtt-ui) will be created. |
|
419 | 302 |
|
420 | 303 |
```{r message=FALSE, warning=FALSE} |
421 |
-getManifest(query.exp.hg19,save = FALSE) |
|
304 |
+getManifest(query,save = FALSE) |
|
422 | 305 |
``` |
423 | 306 |
|
424 | 307 |
# ATAC-seq data |
... | ... |
@@ -440,10 +323,10 @@ datatable( |
440 | 323 |
You can use the function `GDCquery_ATAC_seq` filter the manifest table and use `GDCdownload` to save the data locally. |
441 | 324 |
```{r message=FALSE, warning=FALSE,eval = FALSE} |
442 | 325 |
query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "rds") |
443 |
-GDCdownload(query,method = "client") |
|
326 |
+GDCdownload(query, method = "client") |
|
444 | 327 |
|
445 | 328 |
query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "bigWigs") |
446 |
-GDCdownload(query,method = "client") |
|
329 |
+GDCdownload(query, method = "client") |
|
447 | 330 |
|
448 | 331 |
``` |
449 | 332 |
|