Bioconductor Code: TCGAbiolinks

Browse code

Starting to remove the legacy archive from TCGAbiolinks, since it will be shut down by GDC

Tiago Silva authored on 06/05/2023 15:00:13
Showing 16 changed files

DESCRIPTION index 31d9c76d..f3358f98 100644
NEWS index 61ec4e71..458f3daa 100644
R/clinical.R index 6773831c..1419a5a1 100644
R/download.R index e0dc437b..be812ff9 100644
R/internal.R index bea2f02d..dd35d78a 100644
R/prepare.R index 55cbbbcf..fbfc3f17 100644
R/query.R index 3b394d7a..5a86d06f 100644
R/visualize.R index 488417ed..8da71a19 100644
man/GDCdownload.Rd index 004555f1..cd66a992 100644
man/GDCquery.Rd index d3b5e524..10153d84 100644
man/TCGAvisualize_oncoprint.Rd index 001f1124..505608cd 100644
tests/testthat/test-prepare-download.R index 6a58c48e..6b5ee2f9 100644
tests/testthat/test-query-clinical.R index d35b3d32..e55ead60 100644
tests/testthat/test-query.R index a8925365..f51c2297 100644
vignettes/download_prepare.Rmd index 39af10ab..8194b68d 100644
vignettes/query.Rmd index ac3e6a43..3dc0400a 100644

History View file @ c8404f5f

@@ -1,7 +1,7 @@
                      Package: TCGAbiolinks
                      Type: Package
                      Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data
                     -Version: 2.29.0
                     +Version: 2.29.1
                      Date: 2022-17-08
                      Author: Antonio Colaprico,
                          Tiago Chedraoui Silva,

NEWS

History View file @ c8404f5f

@@ -1,3 +1,8 @@
                     +CHANGES IN VERSION 2.29.1
                     +-------------------------
+                    +
                     +* Removing support for the legacy archive since it will be shut down by GDC soon.
+                    +
                      CHANGES IN VERSION 2.21.1
                      -------------------------

R/clinical.R

History View file @ c8404f5f

@@ -344,13 +344,16 @@ GDCquery_clinic <- function(
                                                  } else {
                                                      # HTMCP-03-06-02061 has two diagnosis
                                                      x$submitter_id <- gsub("_diagnosis.*","",x$submitter_id)
                     +                                # If there are two rows for the same submitter_id
                     +                                # we will collapse them into one single row
                     +                                # concatenating all columns using ;
                                                      aux <- x %>% dplyr::group_by(submitter_id) %>%
                     -                                    dplyr::summarise_each(funs(paste(unique(.), collapse = ";")))
                     +                                    summarise(across(everything(),~ paste(unique(.), collapse = ";")))
                                                      aux$treatments <- list(dplyr::bind_rows(x$treatments))
                                                      aux
+                                                 }
+                                             }
                     -                    ),fill = T
                     +                    ), fill = TRUE
+                                     )
                                      #df$submitter_id <- gsub("^d|_diagnosis|diag-|-DX|-DIAG|-diagnosis","", df$submitter_id)
                                      # ^d ORGANOID-PANCREATIC
@@ -500,7 +503,7 @@ GDCprepare_clinic <- function(
+                         }
                          # Get all the clinical xml files
                     -    source <- ifelse(query$legacy,"legacy","harmonized")
                     +    source <- "harmonized"
                          files <- file.path(
                              query$results[[1]]$project, source,
                              gsub(" ","_",query$results[[1]]$data_category),

R/download.R

History View file @ c8404f5f

@@ -16,15 +16,6 @@
                      #' @importFrom methods is
                      #' @export
                      #' @examples
                     -#' query <- GDCquery(
                     -#'   project = "TCGA-ACC",
                     -#'   data.category =  "Copy number variation",
                     -#'   legacy = TRUE,
                     -#'   file.type = "hg19.seg",
                     -#'   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
                     -#'  )
                     -#' # data will be saved in  GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
                     -#' GDCdownload(query, method = "api")
                      #' \dontrun{
                      #'     # Download clinical data from XML
                      #'     query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
@@ -39,14 +30,14 @@
                      #'     # data will be saved in:
                      #'     # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
                      #'     GDCdownload(query, method = "client", directory = "example_data_dir")
                     -#'     acc.gbm <- GDCquery(
                     +#'     query_acc_gbm <- GDCquery(
                      #'         project =  c("TCGA-ACC","TCGA-GBM"),
                      #'         data.category = "Transcriptome Profiling",
                      #'         data.type = "Gene Expression Quantification",
                      #'         workflow.type = "STAR - Counts"
                      #'     )
                      #'     GDCdownload(
                     -#'        query = acc.gbm,
                     +#'        query = query_acc_gbm,
                      #'        method = "api",
                      #'        directory = "example",
                      #'        files.per.chunk = 50
@@ -73,7 +64,7 @@ GDCdownload <- function(
                              stop("We can only download one data type. Please use data.type argument in GDCquery to filter results.")
+                         }
                     -    source <- ifelse(query$legacy,"legacy","harmonized")
                     +    source <- "harmonized"
                          dir.create(directory, showWarnings = FALSE, recursive = TRUE)
                          for(proj in unique(unlist(query$project))){
@@ -152,11 +143,7 @@ GDCdownload <- function(
+                                     )
+                                 }
                     -            server <- ifelse(
                     -                query$legacy,
                     -                "https://api.gdc.cancer.gov/legacy/data/",
                     -                "https://api.gdc.cancer.gov/data/"
                     -            )
                     +            server <- "https://api.gdc.cancer.gov/data/"
                                  if (is.null(files.per.chunk) & sum(as.numeric(manifest$size)) > 10^9) {
                                      message("The total size of files is big. We will download files in chunks")

R/internal.R

History View file @ c8404f5f

@@ -67,107 +67,52 @@ checkProjectInput <- function(project){
+                         }
+                     }
                     -checkLegacyPlatform <- function(project,data.category, legacy = FALSE){
                     -    project.summary <- getProjectSummary(project, legacy)
                     -    if(missing(data.category)) {
                     -        print(knitr::kable(project.summary$data_categories))
                     -        stop("Please set a data.category argument from the column data_category above")
                     -    }
                     -    if(!(data.category %in% project.summary$data_categories$data_category)) {
                     -        print(knitr::kable(project.summary$data_categories))
                     -        stop("Please set a valid data.category argument from the column data_category above")
                     -    }
                     -}
                     +checkDataTypeInput <- function(data.type){
+                    +
                     +    harmonized.data.type <- c(
                     +        "Aggregated Somatic Mutation",
                     +        "Aligned Reads",
                     +        "Gene Expression Quantification",
                     +        "Raw CGI Variant",
                     +        "Methylation Beta Value",
                     +        "Differential Gene Expression",
                     +        "Splice Junction Quantification",
                     +        "Protein Expression Quantification",
                     +        "Annotated Somatic Mutation",
                     +        "Raw Simple Somatic Mutation",
                     +        "Masked Somatic Mutation",
                     +        "Copy Number Segment",
                     +        "Masked Intensities",
                     +        "Allele-specific Copy Number Segment",
                     +        "Masked Copy Number Segment",
                     +        "Isoform Expression Quantification",
                     +        "miRNA Expression Quantification",
                     +        "Gene Level Copy Number",
                     +        "Biospecimen Supplement",
                     +        "Gene Level Copy Number Scores",
                     +        "Protein Expression Quantification",
                     +        "Clinical Supplement",
                     +        "Single Cell Analysis",
                     +        "Masked Somatic Mutation",
                     +        "Slide Image"
                     +    )
                     -checkDataTypeInput <- function(legacy, data.type){
                     -    if(legacy){
                     -        legacy.data.type <- c("Copy number segmentation",
                     -                              "Raw intensities",
                     -                              "Aligned reads",
                     -                              "Copy number estimate",
                     -                              "Simple nucleotide variation",
                     -                              "Gene expression quantification",
                     -                              "Coverage WIG",
                     -                              "miRNA gene quantification",
                     -                              "Genotypes",
                     -                              "miRNA isoform quantification",
                     -                              "Normalized copy numbers",
                     -                              "Isoform expression quantification",
                     -                              "Normalized intensities",
                     -                              "Tissue slide image",
                     -                              "Exon quantification",
                     -                              "Exon junction quantification",
                     -                              "Methylation beta value",
                     -                              "Unaligned reads",
                     -                              "Diagnostic image",
                     -                              "CGH array QC",
                     -                              "Biospecimen Supplement",
                     -                              "Pathology report",
                     -                              "Clinical Supplement",
                     -                              "Intensities",
                     -                              "Protein expression quantification",
                     -                              "Microsatellite instability",
                     -                              "Structural variation",
                     -                              "Auxiliary test",
                     -                              "Copy number QC metrics",
                     -                              "Intensities Log2Ratio",
                     -                              "Methylation array QC metrics",
                     -                              "Clinical data",
                     -                              "Copy number variation",
                     -                              "ABI sequence trace",
                     -                              "Protein Expression Quantification",
                     -                              "Biospecimen data",
                     -                              "Simple somatic mutation",
                     -                              "Bisulfite sequence alignment",
                     -                              "Methylation percentage",
                     -                              "Sequencing tag",
                     -                              "Sequencing tag counts",
                     -                              "LOH")
                     -        if(!data.type %in% legacy.data.type) {
                     -            print(knitr::kable(as.data.frame(sort(legacy.data.type))))
                     -            stop("Please set a data.type argument from the column legacy.data.type above")
                     -        }
                     -    } else {
                     -        harmonized.data.type <- c(
                     -            "Aggregated Somatic Mutation",
                     -            "Aligned Reads",
                     -            "Gene Expression Quantification",
                     -            "Raw CGI Variant",
                     -            "Methylation Beta Value",
                     -            "Differential Gene Expression",
                     -            "Splice Junction Quantification",
                     -            "Protein Expression Quantification",
                     -            "Annotated Somatic Mutation",
                     -            "Raw Simple Somatic Mutation",
                     -            "Masked Somatic Mutation",
                     -            "Copy Number Segment",
                     -            "Masked Intensities",
                     -            "Allele-specific Copy Number Segment",
                     -            "Masked Copy Number Segment",
                     -            "Isoform Expression Quantification",
                     -            "miRNA Expression Quantification",
                     -            "Gene Level Copy Number",
                     -            "Biospecimen Supplement",
                     -            "Gene Level Copy Number Scores",
                     -            "Protein Expression Quantification",
                     -            "Clinical Supplement",
                     -            "Single Cell Analysis",
                     -            "Masked Somatic Mutation",
                     -            "Slide Image")
                     -        if(!data.type %in% harmonized.data.type) {
                     -            print(knitr::kable(as.data.frame(sort(harmonized.data.type))))
                     -            stop("Please set a data.type argument from the column harmonized.data.type above")
                     -        }
                     +    if (!data.type %in% harmonized.data.type) {
                     +        print(knitr::kable(as.data.frame(sort(harmonized.data.type))))
                     +        stop("Please set a data.type argument from the column harmonized.data.type above")
+                         }
+                     }
                     -checkDataCategoriesInput <- function(project,data.category, legacy = FALSE){
                     +checkDataCategoriesInput <- function(project,data.category){
+                    +
                          for(proj in project){
                     -        project.summary <- getProjectSummary(proj, legacy)
+                    +
                     +        project.summary <- getProjectSummary(proj)
                              if(missing(data.category)) {
                                  print(knitr::kable(project.summary$data_categories))
                                  stop("Please set a data.category argument from the column data_category above")
+                             }
+                    +
                              if(!(data.category %in% project.summary$data_categories$data_category)) {
                                  print(knitr::kable(project.summary$data_categories))
                                  stop("Please set a valid data.category argument from the column data_category above. We could not validade the data.category for project ", proj)
@@ -618,13 +563,10 @@ get.mutation <- function(
                          if(missing(genes)) stop("Argument genes is missing")
                          # Get mutation annotation file
                     -    library(maftools)
                     -    library(dplyr)
                          query <- GDCquery(
                              project = project,
                              data.category = "Simple Nucleotide Variation",
                              access = "open",
                     -        legacy = FALSE,
                              data.type = "Masked Somatic Mutation",
                              workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
+                         )
@@ -638,8 +580,9 @@ get.mutation <- function(
                              unlist(
                                  sapply(
                                      mutant_variant_classification,
                     -                function(x) grep(x,maf$Variant_Classification,
                     -                                 ignore.case = TRUE)
                     +                function(x) {
                     +                    grep(x,maf$Variant_Classification,ignore.case = TRUE)
                     +                }
+                                 )
+                             )
+                         )
@@ -648,8 +591,10 @@ get.mutation <- function(
                          mut <- NULL
                          for(i in genes) {
                              if(!i %in% maf$Hugo_Symbol) next
                     -        aux <- data.frame(patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15),
                     -                          mut = TRUE)
                     +        aux <- data.frame(
                     +            patient = substr(unique(maf[i == maf$Hugo_Symbol,]$Tumor_Sample_Barcode),1,15),
                     +            mut = TRUE
                     +        )
                              colnames(aux)[2] <- paste0("mut_hg38_",i)
                              if(is.null(mut)) {
                                  mut <- aux
@@ -668,6 +613,7 @@ get.mutation <- function(
                          return(mut)
+                     }
+                    +
                      get.mut.gistc <- function(
                              project,
                              genes,
@@ -694,6 +640,7 @@ get.mut.gistc <- function(
                          } else if(is.null(mut) & !is.null(cnv)) {
                              return(cnv)
+                         }
+                    +
                          return(NULL)
+                     }
                      get.mut.gistc.information <- function(

R/prepare.R

History View file @ c8404f5f

@@ -91,7 +91,7 @@ GDCprepare <- function(
                              stop("To remove the files, please set save to TRUE. Otherwise, the data will be lost")
+                         }
                          # We save the files in project/source/data.category/data.type/file_id/file_name
                     -    source <- ifelse(query$legacy,"legacy","harmonized")
                     +    source <- "harmonized"
                          files <- file.path(
                              query$results[[1]]$project, source,
                              gsub(" ","_",query$results[[1]]$data_category),
@@ -174,8 +174,7 @@ GDCprepare <- function(
                                  files = files,
                                  cases = cases,
                                  summarizedExperiment = summarizedExperiment,
                     -            platform =  unique(query$results[[1]]$platform),
                     -            legacy = query$legacy
                     +            platform =  unique(query$results[[1]]$platform)
+                             )
                          }  else if (grepl("Raw intensities|Masked Intensities",query$data.type, ignore.case = TRUE)) {
                              # preparing IDAT files
@@ -183,8 +182,7 @@ GDCprepare <- function(
                                  files = files,
                                  barcode = cases,
                                  summarizedExperiment = summarizedExperiment,
                     -            platform =  unique(query$results[[1]]$platform),
                     -            legacy = query$legacy
                     +            platform =  unique(query$results[[1]]$platform)
+                             )
                          }  else if (grepl("Proteome Profiling",query$data.category,ignore.case = TRUE)) {
@@ -199,7 +197,7 @@ GDCprepare <- function(
                          }  else if (grepl("Simple Nucleotide Variation",query$data.category,ignore.case = TRUE)) {
                     -        if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE) | source == "legacy"){
                     +        if(grepl("Masked Somatic Mutation",query$results[[1]]$data_type[1],ignore.case = TRUE)){
                                  data <- readSimpleNucleotideVariationMaf(files)
+                             }
@@ -212,7 +210,7 @@ GDCprepare <- function(
                                      files = files,
                                      cases = cases,
                                      summarizedExperiment = summarizedExperiment,
                     -                genome = ifelse(query$legacy,"hg19","hg38"),
                     +                genome = "hg38",
                                      experimental.strategy = unique(query$results[[1]]$experimental_strategy)
+                                 )
@@ -221,7 +219,7 @@ GDCprepare <- function(
                                      files = files,
                                      cases = cases,
                                      summarizedExperiment = FALSE,
                     -                genome = ifelse(query$legacy,"hg19","hg38"),
                     +                genome = "hg38",
                                      experimental.strategy = unique(query$results[[1]]$experimental_strategy)
+                                 )
@@ -713,14 +711,13 @@ readIDATDNAmethylation <- function(
                              files,
                              barcode,
                              summarizedExperiment,
                     -        platform,
                     -        legacy
                     +        platform
                      ) {
                          check_package("sesame")
                          # Check if moved files would be moved outside of scope folder, if so, path doesn't change
                     -    moved.files <- sapply(files,USE.NAMES=FALSE,function(x){
                     +    moved.files <- sapply(files,USE.NAMES = FALSE,function(x){
                              if (grepl("Raw_intensities|Masked_Intensities",dirname(dirname(x)))) {
                                  return(file.path(dirname(dirname(x)), basename(x)))
+                             }
@@ -753,7 +750,7 @@ readIDATDNAmethylation <- function(
                              betas <- makeSEFromDNAMethylationMatrix(
                                  betas = betas,
                     -            genome = ifelse(legacy,"hg19","hg38"),
                     +            genome ="hg38",
                                  met.platform = platform
+                             )
                              colData(betas) <- DataFrame(colDataPrepare(colnames(betas)))
@@ -774,8 +771,7 @@ readDNAmethylation <- function(
                              files,
                              cases,
                              summarizedExperiment = TRUE,
                     -        platform,
                     -        legacy
                     +        platform
                      ){
                          if(length(platform) > 1){
@@ -847,7 +843,7 @@ readDNAmethylation <- function(
                                  df <- makeSEFromDNAMethylationMatrix(
                                      betas = df,
                     -                genome = ifelse(legacy,"hg19","hg38"),
                     +                genome = "hg38",
                                      met.platform = platform
+                                 )
+                             }
@@ -1056,31 +1052,37 @@ colDataPrepareTCGA <- function(barcode){
                          # For the moment this will work only for TCGA Data
                          # We should search what TARGET data means
                     -    code <- c('01','02','03','04','05','06','07','08','09','10','11',
                     -              '12','13','14','20','40','50','60','61')
                     -    shortLetterCode <- c("TP","TR","TB","TRBM","TAP","TM","TAM","THOC",
                     -                         "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB",
                     -                         "CELL","XP","XCL")
+                    -
                     -    definition <- c("Primary solid Tumor", # 01
                     -                    "Recurrent Solid Tumor", # 02
                     -                    "Primary Blood Derived Cancer - Peripheral Blood", # 03
                     -                    "Recurrent Blood Derived Cancer - Bone Marrow", # 04
                     -                    "Additional - New Primary", # 05
                     -                    "Metastatic", # 06
                     -                    "Additional Metastatic", # 07
                     -                    "Human Tumor Original Cells", # 08
                     -                    "Primary Blood Derived Cancer - Bone Marrow", # 09
                     -                    "Blood Derived Normal", # 10
                     -                    "Solid Tissue Normal",  # 11
                     -                    "Buccal Cell Normal",   # 12
                     -                    "EBV Immortalized Normal", # 13
                     -                    "Bone Marrow Normal", # 14
                     -                    "Control Analyte", # 20
                     -                    "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
                     -                    "Cell Lines", # 50
                     -                    "Primary Xenograft Tissue", # 60
                     -                    "Cell Line Derived Xenograft Tissue") # 61
                     +    code <- c(
                     +        '01','02','03','04','05','06','07','08','09','10','11',
                     +        '12','13','14','20','40','50','60','61'
                     +    )
                     +    shortLetterCode <- c(
                     +        "TP","TR","TB","TRBM","TAP","TM","TAM","THOC",
                     +        "TBM","NB","NT","NBC","NEBV","NBM","CELLC","TRB",
                     +        "CELL","XP","XCL"
                     +    )
+                    +
                     +    definition <- c(
                     +        "Primary solid Tumor", # 01
                     +        "Recurrent Solid Tumor", # 02
                     +        "Primary Blood Derived Cancer - Peripheral Blood", # 03
                     +        "Recurrent Blood Derived Cancer - Bone Marrow", # 04
                     +        "Additional - New Primary", # 05
                     +        "Metastatic", # 06
                     +        "Additional Metastatic", # 07
                     +        "Human Tumor Original Cells", # 08
                     +        "Primary Blood Derived Cancer - Bone Marrow", # 09
                     +        "Blood Derived Normal", # 10
                     +        "Solid Tissue Normal",  # 11
                     +        "Buccal Cell Normal",   # 12
                     +        "EBV Immortalized Normal", # 13
                     +        "Bone Marrow Normal", # 14
                     +        "Control Analyte", # 20
                     +        "Recurrent Blood Derived Cancer - Peripheral Blood", # 40
                     +        "Cell Lines", # 50
                     +        "Primary Xenograft Tissue", # 60
                     +        "Cell Line Derived Xenograft Tissue"
                     +    ) # 61
                          aux <- DataFrame(code = code,shortLetterCode,definition)
                          # in case multiple equal barcode
@@ -1088,10 +1090,12 @@ colDataPrepareTCGA <- function(barcode){
                                          "-[:alnum:]{3}-[:alnum:]{3}-[:alnum:]{4}-[:alnum:]{2}")
                          samples <- str_match(barcode,regex)[,1]
                     -    ret <- DataFrame(barcode = barcode,
                     -                     patient = substr(barcode, 1, 12),
                     -                     sample = substr(barcode, 1, 16),
                     -                     code = substr(barcode, 14, 15))
                     +    ret <- DataFrame(
                     +        barcode = barcode,
                     +        patient = substr(barcode, 1, 12),
                     +        sample = substr(barcode, 1, 16),
                     +        code = substr(barcode, 14, 15)
                     +    )
                          ret <- merge(ret,aux, by = "code", sort = FALSE)
                          ret <- ret[match(barcode,ret$barcode),]
                          rownames(ret) <- gsub("\\.","-",make.names(ret$barcode,unique=TRUE))

R/query.R

History View file @ c8404f5f

@@ -3,7 +3,6 @@
                      #'   Uses the GDC API to search for data; it searches for both controlled and
                      #'   open-access data.
                      #'   For GDC data arguments project, data.category, data.type and workflow.type should be used
                     -#'   For the legacy data arguments project, data.category, platform and/or file.extension should be used.
                      #'   Please, see the vignette for a table with the possibilities.
                      #' @param project A list of valid project (see list with TCGAbiolinks:::getGDCprojects()$project_id)]
                      #' \itemize{
@@ -75,33 +74,15 @@
                      #' \item{ Simple Nucleotide Variation }
                      #' \item{ Transcriptome Profiling }
                      #' }
                     -#' List for legacy archive
                     -#' \itemize{
                     -#' \item{ Biospecimen }
                     -#' \item{ Clinical }
                     -#' \item{ Copy number variation }
                     -#' \item{ DNA methylation }
                     -#' \item{ Gene expression }
                     -#' \item{ Protein expression }
                     -#' \item{ Raw microarray data }
                     -#' \item{ Raw sequencing data }
                     -#' \item{ Simple nucleotide variation }
                     -#' }
                      #' @param data.type A data type to filter the files to download
                      #' For the complete list please check the vignette.
                      #' @param sample.type A sample type to filter the files to download
                      #' @param barcode A list of barcodes to filter the files to download
                     -#' @param legacy Search in the legacy repository
                      #' @param data.format Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML",
                      #' "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")
                     -#' @param file.type To be used in the legacy database for some platforms,
                     -#' to define which file types to be used.
                      #' @param workflow.type GDC workflow type
                     -#' @param experimental.strategy Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
                     -#' Legacy:  WXS, RNA-Seq, miRNA-Seq, Genotyping Array,
                     -#' DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS,
                     -#' MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array,
                     -#' Total RNA-Seq, Capillary sequencing, Bisulfite-Seq
                     +#' @param experimental.strategy Filter to experimental strategy.
                     +#' Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
                      #' @param access Filter by access type. Possible values: controlled, open
                      #' @param platform Example:
                      #' \tabular{ll}{
@@ -157,19 +138,6 @@
                      #'    data.type = "Masked Copy Number Segment",
                      #'    sample.type = c("Primary Tumor")
                      #' )
                     -#' query.met <- GDCquery(
                     -#'    project = c("TCGA-GBM","TCGA-LGG"),
                     -#'    legacy = TRUE,
                     -#'    data.category = "DNA methylation",
                     -#'    platform = "Illumina Human Methylation 450"
                     -#' )
                     -#' query <- GDCquery(
                     -#'    project = "TCGA-ACC",
                     -#'    data.category =  "Copy number variation",
                     -#'    legacy = TRUE,
                     -#'    file.type = "hg19.seg",
                     -#'    barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")
                     -#' )
                      #' }
                      #' @return A data frame with the results and the parameters used
                      #' @importFrom jsonlite fromJSON
@@ -183,7 +151,6 @@ GDCquery <- function(
                              data.category,
                              data.type,
                              workflow.type,
                     -        legacy = FALSE,
                              access,
                              platform,
                              file.type,
@@ -243,11 +210,11 @@ GDCquery <- function(
+                             }
                          })
                          print.header("GDCquery: Searching in GDC database","section")
                     -    message("Genome of reference: ",ifelse(legacy,"hg19","hg38"))
                     +    message("Genome of reference: hg38")
                          # Check arguments
                          checkProjectInput(project)
                     -    checkDataCategoriesInput(project, data.category, legacy)
                     -    if(!is.na(data.type)) checkDataTypeInput(legacy = legacy, data.type = data.type)
                     +    checkDataCategoriesInput(project, data.category)
                     +    if(!is.na(data.type)) checkDataTypeInput(data.type = data.type)
                          if(!any(is.na(sample.type))) checkBarcodeDefinition(sample.type)
                          results <- NULL
@@ -257,7 +224,6 @@ GDCquery <- function(
                                  project = proj,
                                  data.category = data.category,
                                  data.type = data.type,
                     -            legacy = legacy,
                                  workflow.type = workflow.type,
                                  platform = platform,
                                  file.type = file.type,
@@ -279,7 +245,6 @@ GDCquery <- function(
                                      project = proj,
                                      data.category = data.category,
                                      data.type = data.type,
                     -                legacy = legacy,
                                      workflow.type = NA,
                                      platform = NA,
                                      file.type = file.type,
@@ -621,17 +586,6 @@ GDCquery <- function(
                              message("ooo By sample.type")
                              results <- results[tolower(results$sample_type) %in% tolower(sample.type),]
+                         }
                     -    # some how there are duplicated files in GDC we should remove them
                     -    # Example of problematic query
                     -    # query.exp <- GDCquery(project = "TCGA-BRCA",
                     -    #                  legacy = TRUE,
                     -    #                  data.category = "Gene expression",
                     -    #                  data.type = "Gene expression quantification",
                     -    #                  platform = "Illumina HiSeq",
                     -    #                  file.type = "results",
                     -    #                  experimental_strategy = "RNA-Seq",
                     -    #                  sample.type = c("Primary solid Tumor","Solid Tissue Normal"))
                     -    #
                          print.header("Checking data","subsection")
                          message("ooo Checking if there are duplicated cases")
@@ -665,7 +619,6 @@ GDCquery <- function(
                              project = I(list(project)),
                              data.category = data.category,
                              data.type = data.type,
                     -        legacy = legacy,
                              access = I(list(access)),
                              experimental.strategy =  I(list(experimental.strategy)),
                              file.type = file.type,
@@ -677,37 +630,41 @@ GDCquery <- function(
                          return(ret)
+                     }
                     -getGDCquery <- function(project, data.category, data.type, legacy, workflow.type,platform,file.type,files.access,sample.type,experimental.strategy){
                     +getGDCquery <- function(
                     +        project,
                     +        data.category,
                     +        data.type,
                     +        workflow.type,
                     +        platform,
                     +        file.type,
                     +        files.access,
                     +        sample.type,
                     +        experimental.strategy
                     +){
                          # Get manifest using the API
                     -    baseURL <- ifelse(legacy,"https://api.gdc.cancer.gov/legacy/files/?","https://api.gdc.cancer.gov/files/?")
                     +    baseURL <- "https://api.gdc.cancer.gov/files/?"
                          options.pretty <- "pretty=true"
                     -    if(data.category == "Protein expression" & legacy) {
                     -        options.expand <- "fields=archive.revision,archive.file_name,md5sum,state,data_category,file_id,platform,file_name,file_size,md5sum,submitter_id,data_type&expand=cases.samples.portions,cases.project,center,analysis"
                     -    } else if(data.category %in% c("Clinical","Biospecimen")) {
                     +    if(data.category %in% c("Clinical","Biospecimen")) {
                              options.expand <- "expand=cases,cases.project,center,analysis"
                          } else {
                              options.expand <- "expand=cases,cases.samples.portions.analytes.aliquots,cases.project,center,analysis,cases.samples"
+                         }
                     -    option.size <- paste0("size=",getNbFiles(project,data.category,legacy))
                     +    option.size <- paste0("size=",getNbFiles(project,data.category))
                          option.format <- paste0("format=JSON")
                     -    options.filter <- paste0("filters=",
                     -                             URLencode('{"op":"and","content":['),  # Start json request
                     -                             URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'),
                     -                             project,
                     -                             URLencode('"]}}'))
                     +    options.filter <- paste0(
                     +        "filters=",
                     +        URLencode('{"op":"and","content":['),  # Start json request
                     +        URLencode('{"op":"in","content":{"field":"cases.project.project_id","value":["'),
                     +        project,
                     +        URLencode('"]}}')
                     +    )
                     -    if(!is.na(experimental.strategy))  options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy))
                     +    if(!is.na(experimental.strategy)) options.filter <- paste0(options.filter,addFilter("files.experimental_strategy", experimental.strategy))
                          if(!is.na(data.category))  options.filter <- paste0(options.filter,addFilter("files.data_category", data.category))
                          if(!is.na(data.type))  options.filter <- paste0(options.filter,addFilter("files.data_type", data.type))
                          if(!is.na(workflow.type))  options.filter <- paste0(options.filter,addFilter("files.analysis.workflow_type", workflow.type))
                          if(!any(is.na(platform))) options.filter <- paste0(options.filter,addFilter("files.platform", platform))
                     -    if(!any(is.na(file.type))) {
                     -        if(file.type == "results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "unnormalized"))
                     -        if(file.type == "normalized_results" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "normalized"))
                     -        if(file.type == "nocnv_hg19.seg" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "nocnv"))
                     -        if(file.type == "hg19.isoform" & legacy) options.filter <- paste0(options.filter,addFilter("files.tags", "hg19"))
                     -    }
                          if(!any(is.na(files.access))) {
                              options.filter <- paste0(options.filter,addFilter("files.access", files.access))
+                         }
@@ -1028,12 +985,11 @@ GDCquery_ATAC_seq <- function(
                          results$data_category <- "ATAC-seq"
                          results$project <- "ATAC-seq"
                          ret <- data.frame(
                     -        results=I(list(results)),
                     +        results = I(list(results)),
                              tumor = I(list(tumor)),
                              project = I(list("ATAC-seq")),
                              data.type = I(list("ATAC-seq")),
                     -        data.category = I(list("ATAC-seq")),
                     -        legacy = I(list(FALSE))
                     +        data.category = I(list("ATAC-seq"))
+                         )
                          return(ret)

R/visualize.R

History View file @ c8404f5f

@@ -871,7 +871,6 @@ unlistlabels <- function(lab) {
                      #' @importFrom data.table dcast setDT setDF :=
                      #' @examples
                      #' \dontrun{
                     -#' library(maftools)
                      #' library(dplyr)
                      #' query <- GDCquery(
                      #'    project = "TCGA-CHOL",
@@ -929,7 +928,6 @@ TCGAvisualize_oncoprint <- function(
                              annotation.legend.side = "bottom"
                      ){
+                    -
                          check_package("ComplexHeatmap")
                          check_package("circlize")
                          check_package("grid")

man/GDCdownload.Rd

History View file @ c8404f5f

@@ -34,15 +34,6 @@ Uses GDC API or GDC transfer tool to download gdc data
                        The data from query will be saved in a folder: project/data.category
+                     }
                      \examples{
                     -query <- GDCquery(
                     -  project = "TCGA-ACC",
                     -  data.category =  "Copy number variation",
                     -  legacy = TRUE,
                     -  file.type = "hg19.seg",
                     -  barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01", "TCGA-OR-A5LJ-10A-01D-A29K-01")
                     - )
                     -# data will be saved in  GDCdata/TCGA-ACC/legacy/Copy_number_variation/Copy_number_segmentation
                     -GDCdownload(query, method = "api")
                      \dontrun{
                          # Download clinical data from XML
                          query <- GDCquery(project = "TCGA-COAD", data.category = "Clinical")
@@ -57,14 +48,14 @@ GDCdownload(query, method = "api")
                          # data will be saved in:
                          # example_data_dir/TARGET-AML/harmonized/Transcriptome_Profiling/miRNA_Expression_Quantification
                          GDCdownload(query, method = "client", directory = "example_data_dir")
                     -    acc.gbm <- GDCquery(
                     +    query_acc_gbm <- GDCquery(
                              project =  c("TCGA-ACC","TCGA-GBM"),
                              data.category = "Transcriptome Profiling",
                              data.type = "Gene Expression Quantification",
                              workflow.type = "STAR - Counts"
+                         )
                          GDCdownload(
                     -       query = acc.gbm,
                     +       query = query_acc_gbm,
                             method = "api",
                             directory = "example",
                             files.per.chunk = 50

man/GDCquery.Rd

History View file @ c8404f5f

@@ -9,7 +9,6 @@ GDCquery(
                        data.category,
                        data.type,
                        workflow.type,
                     -  legacy = FALSE,
                        access,
                        platform,
                        file.type,
@@ -90,18 +89,6 @@ List for harmonized database:
                      \item{ Sequencing Reads }
                      \item{ Simple Nucleotide Variation }
                      \item{ Transcriptome Profiling }
                     -}
                     -List for legacy archive
                     -\itemize{
                     -\item{ Biospecimen }
                     -\item{ Clinical }
                     -\item{ Copy number variation }
                     -\item{ DNA methylation }
                     -\item{ Gene expression }
                     -\item{ Protein expression }
                     -\item{ Raw microarray data }
                     -\item{ Raw sequencing data }
                     -\item{ Simple nucleotide variation }
                      }}
                      \item{data.type}{A data type to filter the files to download
@@ -109,8 +96,6 @@ For the complete list please check the vignette.}
                      \item{workflow.type}{GDC workflow type}
                     -\item{legacy}{Search in the legacy repository}
+                    -
                      \item{access}{Filter by access type. Possible values: controlled, open}
                      \item{platform}{Example:
@@ -140,19 +125,13 @@ HumanMethylation27                \tab Mixed_DNASeq_Cont_curated      \cr
                      IlluminaHiSeq_RNASeqV2            \tab Mixed_DNASeq_Cont
                      }}
                     -\item{file.type}{To be used in the legacy database for some platforms,
                     -to define which file types to be used.}
+                    -
                      \item{barcode}{A list of barcodes to filter the files to download}
                      \item{data.format}{Data format filter ("VCF", "TXT", "BAM","SVS","BCR XML","BCR SSF XML",
                      "TSV", "BCR Auxiliary XML", "BCR OMF XML", "BCR Biotab", "MAF", "BCR PPS XML", "XLSX")}
                     -\item{experimental.strategy}{Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.
                     -Legacy:  WXS, RNA-Seq, miRNA-Seq, Genotyping Array,
                     -DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS,
                     -MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array,
                     -Total RNA-Seq, Capillary sequencing, Bisulfite-Seq}
                     +\item{experimental.strategy}{Filter to experimental strategy.
                     +Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array.}
                      \item{sample.type}{A sample type to filter the files to download}
+                     }
@@ -163,7 +142,6 @@ A data frame with the results and the parameters used
                      Uses GDC API to search for data; it searches for both controlled and
                        open-access data.
                        For GDC data arguments project, data.category, data.type and workflow.type should be used
                     -  For the legacy data arguments project, data.category, platform and/or file.extension should be used.
                        Please, see the vignette for a table with the possibilities.
+                     }
                      \examples{
@@ -193,19 +171,6 @@ query <- GDCquery(
                         data.type = "Masked Copy Number Segment",
                         sample.type = c("Primary Tumor")
+                     )
                     -query.met <- GDCquery(
                     -   project = c("TCGA-GBM","TCGA-LGG"),
                     -   legacy = TRUE,
                     -   data.category = "DNA methylation",
                     -   platform = "Illumina Human Methylation 450"
                     -)
                     -query <- GDCquery(
                     -   project = "TCGA-ACC",
                     -   data.category =  "Copy number variation",
                     -   legacy = TRUE,
                     -   file.type = "hg19.seg",
                     -   barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01")
                     -)
+                     }
+                     }
                      \author{

man/TCGAvisualize_oncoprint.Rd

History View file @ c8404f5f

@@ -87,7 +87,6 @@ Creating a oncoprint
+                     }
                      \examples{
                      \dontrun{
                     -library(maftools)
                      library(dplyr)
                      query <- GDCquery(
                         project = "TCGA-CHOL",

tests/testthat/test-prepare-download.R

History View file @ c8404f5f

@@ -1,17 +1,16 @@
                     -context("Download AND PREPARE")
+                    -
+                    -
                     +context("Download and prepare")
                      test_that("GDCdownload API method is working ", {
                          skip_on_bioc()
                          skip_if_offline()
                     -    cases <-  c(
                     +    cases <- c(
                              "TCGA-PA-A5YG-01A-11R-A29S-07",
                              "TCGA-OR-A5JX-01A-11R-A29S-07",
                              "TCGA-PK-A5HA-01A-11R-A29S-07",
                              "TCGA-OR-A5KY-01A-11R-A29S-07"
+                         )
+                    +
                          acc <- GDCquery(
                              project =  c("TCGA-ACC"),
                              data.category = "Transcriptome Profiling",
@@ -20,8 +19,8 @@ test_that("GDCdownload API method is working ", {
                              barcode = substr(cases,1,12)
+                         )
                          GDCdownload(acc, method = "api", directory = "ex")
+                    -
                          obj <- GDCprepare(acc,  directory = "ex",summarizedExperiment = TRUE)
+                    +
                          expect_true(all(substr(colnames(obj),1,12) == substr(cases,1,12)))
                          expect_true(all(obj$barcode == cases))
@@ -46,9 +45,6 @@ test_that("GDCdownload API method is working ", {
                          expect_true(all(query$results[[1]]$sample.submitter_id == data$sample_submitter_id))
                      })
+                    -
+                    -
+                    -
                      test_that("getBarcodeInfo works", {
                          skip_on_bioc()
                          skip_if_offline()
@@ -61,11 +57,14 @@ test_that("getBarcodeInfo works", {
                          x <- getBarcodeInfo(c("TARGET-20-PARUDL-03A"))
                          expect_true(all(cols %in% colnames(x)))
                     -    samples <- c("HCM-CSHL-0063-C18-85A",
                     -                 "HCM-CSHL-0065-C20-06A",
                     -                 "HCM-CSHL-0065-C20-85A",
                     -                 "HCM-CSHL-0063-C18-01A")
                     +    samples <- c(
                     +        "HCM-CSHL-0063-C18-85A",
                     +        "HCM-CSHL-0065-C20-06A",
                     +        "HCM-CSHL-0065-C20-85A",
                     +        "HCM-CSHL-0063-C18-01A"
                     +    )
                          x <- colDataPrepare(samples)
+                    +
                          expect_true(all(rownames(x) == samples))
                          expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","gender"] == "male")
                          expect_true(x[x$sample_submitter_id == "HCM-CSHL-0065-C20-06A","tumor_grade"] == "G2")
@@ -102,22 +101,29 @@ test_that("colDataPrepare handle replicates", {
                      test_that("GDCprepare accepts more than one project", {
                          skip_on_bioc()
                          skip_if_offline()
                     -    cases <-  c("TCGA-OR-A5JX-01A", "TCGA-OR-A5J3-01A",
                     -                "TCGA-06-0680-11A","TCGA-14-0871-01A")
                     +    cases <-  c(
                     +        "TCGA-OR-A5JX-01A",
                     +        "TCGA-OR-A5J3-01A",
                     +        "TCGA-06-0680-11A",
                     +        "TCGA-14-0871-01A"
                     +    )
                          expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% colDataPrepare(cases)$project_id))
                     -    acc.gbm <- GDCquery(project =  c("TCGA-ACC","TCGA-GBM"),
                     -                        data.category = "Transcriptome Profiling",
                     -                        data.type = "Gene Expression Quantification",
                     -                        workflow.type = "STAR - Counts",
                     -                        barcode = substr(cases,1,12))
                     -    GDCdownload(acc.gbm, method = "api", directory = "ex")
                     -    obj <- GDCprepare(acc.gbm,  directory = "ex")
                     +    query_acc_gbm <- GDCquery(
                     +        project =  c("TCGA-ACC","TCGA-GBM"),
                     +        data.category = "Transcriptome Profiling",
                     +        data.type = "Gene Expression Quantification",
                     +        workflow.type = "STAR - Counts",
                     +        barcode = substr(cases, 1, 12)
                     +    )
                     +    GDCdownload(query_acc_gbm, method = "api", directory = "ex")
                     +    obj <- GDCprepare(query_acc_gbm,  directory = "ex")
                          expect_true(all(c("TCGA-ACC","TCGA-GBM") %in% SummarizedExperiment::colData(obj)$project_id))
                      })
                      test_that("Non TCGA data is processed", {
                          skip_on_bioc()
                          skip_if_offline()
+                    +
                          proj <- "MMRF-COMMPASS"
                          query <- GDCquery(
                              project = proj,
@@ -132,8 +138,6 @@ test_that("Non TCGA data is processed", {
                              workflow.type = "STAR - Counts",
                              barcode = getResults(query)$cases[1:4]
+                         )
                     -    #GDCdownload(query)
                     -    #data <- GDCprepare(query)
                      })
                      test_that("Gene Level Copy Number is being correctly prepare", {
@@ -151,7 +155,7 @@ test_that("Gene Level Copy Number is being correctly prepare", {
                          data <- GDCprepare(query,directory = "ex")
                          expect_true(all(substr(colnames(data),1,12) == c("TCGA-OR-A5JD","TCGA-OR-A5J7")))
                     -    unlink("ex",recursive = TRUE,force = TRUE)
                     +    unlink("ex", recursive = TRUE, force = TRUE)
                      })
                      test_that("DNAm files is processed correctly", {
@@ -170,28 +174,6 @@ test_that("DNAm files is processed correctly", {
                          expect_lt(abs(assay(data.hg38)["cg16739396","TCGA-E2-A158-01A-11D-A12E-05"] - 0.0688655418909783),10^-10)
                      })
                     -test_that("IDAT files is processed", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    proj <- "TCGA-LUAD"
                     -    query <- GDCquery(
                     -        project = proj,
                     -        data.category = "Raw microarray data",
                     -        data.type = "Raw intensities",
                     -        experimental.strategy = "Methylation array",
                     -        legacy = TRUE,
                     -        file.type = ".idat",
                     -        barcode = "TCGA-55-7724",
                     -        platform = "Illumina Human Methylation 450"
                     -    )
                     -    #tryCatch(GDCdownload(query, method = "api", files.per.chunk = 20),
                     -    #         error = function(e) GDCdownload(query, method = "client"))
                     -    #betas <- GDCprepare(query)
                     -    #expect_true(nrow(betas) == 485577)
                     -    #expect_true(ncol(betas) == 1)
                     -})
+                    -
                      test_that("Prepare samples without clinical data", {
                          skip_on_bioc()
                          skip_if_offline()
@@ -214,30 +196,10 @@ test_that("Prepare multiple samples from the same patient", {
                          expect_true("age_at_diagnosis" %in% colnames(x))
                      })
                     -test_that("Preparing HT_HG-U133A as SE works", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    query <- GDCquery(
                     -        project = "TCGA-GBM",
                     -        legacy = TRUE,
                     -        data.category = "Gene expression",
                     -        data.type = "Gene expression quantification",
                     -        platform = c("HT_HG-U133A")
                     -    )
                     -    query$results[[1]] <- query$results[[1]][1:2,]
                     -    GDCdownload(query, method = "api", files.per.chunk = 100)
                     -    se <- GDCprepare(query, summarizedExperiment = TRUE)
+                    -
                     -    expect_true(is(se,"SummarizedExperiment"))
                     -})
+                    -
+                    -
                      test_that("Preparing RRPA files with number of proteins works", {
                          skip_on_bioc()
                          skip_if_offline()
+                    -
                          query_rppa <- GDCquery(
                              project = c("TCGA-COAD"),
                              data.category = "Proteome Profiling",
@@ -249,9 +211,12 @@ test_that("Preparing RRPA files with number of proteins works", {
                          GDCdownload(query_rppa)
                     -    expect_message(object = {
                     -        data_rppa <- GDCprepare(query_rppa)
                     -    },regexp = "Some files have a  different number of proteins, we will introduce NA for the missing values")
                     +    expect_message(
                     +        object = {
                     +            data_rppa <- GDCprepare(query_rppa)
                     +        },
                     +        regexp = "Some files have a  different number of proteins, we will introduce NA for the missing values"
                     +    )
                          expect_true(is(data_rppa,"data.frame"))
                      })

tests/testthat/test-query-clinical.R

History View file @ c8404f5f

@@ -11,7 +11,7 @@ test_that("TCGAquery_SampleTypes returns the correct barcodes", {
                      test_that("GDCquery_clinic populates correctly the data", {
                          skip_on_bioc()
                     -    results <- GDCquery_clinic( "BEATAML1.0-COHORT")
                     +    results <- GDCquery_clinic(project = "BEATAML1.0-COHORT")
                          results.2028 <- results[results$submitter_id == "2028",]
                          expect_equal(results.2028$vital_status,"Alive")
                          expect_true(
@@ -27,7 +27,7 @@ test_that("GDCquery_clinic populates correctly the data", {
                          expect_equal(results.42$ethnicity,"not hispanic or latino")
                          expect_equal(as.integer(results.2028$age_at_diagnosis %>% as.numeric() / 365.25),56)
                     -    results <- GDCquery_clinic( "TCGA-LUAD")
                     +    results <- GDCquery_clinic(project = "TCGA-LUAD")
                          results.sample <- results[results$submitter_id == "TCGA-80-5608",]
                          expect_equal(results.sample$vital_status,"Alive")
                          expect_equal(results.sample$gender,"female")

tests/testthat/test-query.R

History View file @ c8404f5f

@@ -20,16 +20,19 @@ test_that("GDCquery accepts more than one project", {
                              data.category = "Copy Number Variation",
                              data.type = "Copy Number Segment"
+                         )
+                    +
                          gbm <- GDCquery(
                              project = "TCGA-GBM",
                              data.category = "Copy Number Variation",
                              data.type = "Copy Number Segment"
+                         )
+                    +
                          acc.gbm <- GDCquery(
                              project =  c("TCGA-ACC","TCGA-GBM"),
                              data.category = "Copy Number Variation",
                              data.type = "Copy Number Segment"
+                         )
+                    +
                          expect_equal(unique(acc.gbm$results[[1]]$data_type),"Copy Number Segment")
                          expect_equal(nrow(acc.gbm$results[[1]]), sum(nrow(acc$results[[1]]),nrow(gbm$results[[1]])))
                          expect_true(nrow(dplyr::anti_join(acc$results[[1]],acc.gbm$results[[1]], by = "file_id")) == 0)
@@ -51,34 +54,24 @@ test_that("GDCquery can filter by sample.type", {
                          expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
                          sample.type <- "Solid Tissue Normal"
                     -    query <- GDCquery(project = "TCGA-ACC",
                     -                      data.category =  "Copy Number Variation",
                     -                      data.type = "Masked Copy Number Segment",
                     -                      sample.type = sample.type)
                     -    expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
+                    -
                     -    sample.type <- "Solid Tissue Normal"
                     -    query <- GDCquery(project =  c("TCGA-COAD"),
                     -                      data.category = "Transcriptome Profiling",
                     -                      data.type = "Gene Expression Quantification",
                     -                      workflow.type = "STAR - Counts",
                     -                      sample.type = sample.type)
                     +    query <- GDCquery(
                     +        project = "TCGA-ACC",
                     +        data.category =  "Copy Number Variation",
                     +        data.type = "Masked Copy Number Segment",
                     +        sample.type = sample.type
                     +    )
                          expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
+                    -
                          sample.type <- "Solid Tissue Normal"
                     -    query <- GDCquery(project = "TCGA-BRCA",
                     -                      legacy = TRUE,
                     -                      data.category = "Gene expression",
                     -                      data.type = "Gene expression quantification",
                     -                      platform = "Illumina HiSeq",
                     -                      file.type = "results",
                     -                      experimental.strategy = "RNA-Seq",
                     -                      sample.type = sample.type)
                     +    query <- GDCquery(
                     +        project =  c("TCGA-COAD"),
                     +        data.category = "Transcriptome Profiling",
                     +        data.type = "Gene Expression Quantification",
                     +        workflow.type = "STAR - Counts",
                     +        sample.type = sample.type
                     +    )
                          expect_equal(as.character(unique(query$results[[1]]$sample_type)),sample.type)
+                    -
+                    -
                          sample.type <- c("Solid Tissue Normal", "Primary Tumor")
                          query <- GDCquery(
                              project = "TCGA-ACC",
@@ -121,56 +114,6 @@ test_that("GDCquery can filter by barcode", {
                          expect_true(!all(c("TCGA-3C-AALK","TCGA-A2-A04Q","TCGA-A4-A04Q") %in% query$results[[1]]$cases))
                      })
                     -test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg18", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    query <- GDCquery(project = "TCGA-ACC",
                     -                      data.category =  "Copy number variation",
                     -                      legacy = TRUE,
                     -                      file.type = "nocnv_hg18.seg",
                     -                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
                     -    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg18.seg.txt")
                     -})
+                    -
                     -test_that("GDCquery can filter copy number from legacy data by file type. Case: hg18", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    query <- GDCquery(project = "TCGA-ACC",
                     -                      data.category =  "Copy number variation",
                     -                      legacy = TRUE,
                     -                      file.type = "hg18.seg",
                     -                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
                     -    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg18.seg.txt")
                     -})
+                    -
                     -test_that("GDCquery can filter copy number from legacy data by file type. Case: hg19", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    query <- GDCquery(project = "TCGA-ACC",
                     -                      data.category =  "Copy number variation",
                     -                      legacy = TRUE,
                     -                      file.type = "hg19.seg",
                     -                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
                     -    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.hg19.seg.txt")
                     -})
+                    -
+                    -
                     -test_that("GDCquery can filter copy number from legacy data by file type. Case: nocnv_hg19", {
                     -    skip_on_bioc()
                     -    skip_if_offline()
+                    -
                     -    query <- GDCquery(project = "TCGA-ACC",
                     -                      data.category =  "Copy number variation",
                     -                      legacy = TRUE,
                     -                      file.type = "nocnv_hg19.seg",
                     -                      barcode = c("TCGA-OR-A5LR-01A-11D-A29H-01"))
                     -    expect_equal(query$results[[1]]$file_name,"AQUAE_p_TCGA_112_304_b2_N_GenomeWideSNP_6_D10_1348300.nocnv_hg19.seg.txt")
+                    -
                     -})
+                    -
                      test_that("GDCquery can filter by access level", {
                          skip_on_bioc()
@@ -186,15 +129,12 @@ test_that("GDCquery can filter by access level", {
                          expect_equal(unique(query$results[[1]]$access),"controlled")
                      })
+                    -
+                    -
+                    -
                      test_that("getNbFiles and getNbCases works", {
                          skip_on_bioc()
                          skip_if_offline()
                          aux <- getProjectSummary("TCGA-LUAD",TRUE)
                     -    files <- getNbFiles("TCGA-LUAD","Raw microarray data",legacy = T)
                     +    files <- getNbFiles("TCGA-LUAD","Raw microarray data")
                          cases <- getNbCases("TCGA-LUAD","Raw microarray data")
                          expect_true(cases < files)
                      })

vignettes/download_prepare.Rmd

History View file @ c8404f5f

@@ -72,18 +72,8 @@ which defines the output type a Summarized Experiment (default option) or a data
                      To create a `SummarizedExperiment` object, we annotate the data with genomic positions
                      from the latest patch release of the genome available.
                     -For legacy data (data aligned to hg19) TCGAbiolinks is using GRCh37.p13 and for
                     -harmonized data (data aligned to hg38) now it is using Gencode version 36.
                     -Unfortunately, some of the updates changes/remove gene symbols, change coordinates, etc.
                     -Which might introduce some loss of data. For example, if the gene was removed we cannot map
                     -it anymore and that information will be lost in the `SummarizedExperiment`.
+                    -
                     -If you set `SummarizedExperiment` to `FALSE`, you will get the data unmodified
                     -just as they are in the files and ad your own annotation.
+                    -
                     -Also, there are no updated for DNA methylation data. But the last metadata available can be found
                     -here: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation)
                     +Also, the latest DNA methylation metadata is available at: [http://zwdzwd.github.io/InfiniumAnnotation](http://zwdzwd.github.io/InfiniumAnnotation)
                      </div>
                      </div>
@@ -132,48 +122,6 @@ in `GDCprepare` and `GDCdownload`
                      | mut.pipeline 	| If add.gistic2.mut is not NULL this field will be taken into consideration. Four separate variant calling pipelines are implemented for GDC data harmonization. Options: muse, varscan2, somaticsniper, MuTect2. For more information: https://gdc-docs.nci.nih.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/ 	|
                      | mutant_variant_classification 	| List of mutant_variant_classification values used to decide whether a sample is considered mutant or not. Default: "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "In_Frame_Del", "In_Frame_Ins", "Translation_Start_Site", "Nonstop_Mutation" 	|
                     -## Search and download data from legacy database using GDC api method
+                    -
                     -In this example we will download gene expression data from legacy database (data
                     -aligned against genome of reference hg19) using GDC api method and  we will show object data and metadata.
                     -```{r results = 'hide', message=FALSE, warning=FALSE, eval = F}
                     -query <- GDCquery(
                     -    project = "TCGA-GBM",
                     -    data.category = "Gene expression",
                     -    data.type = "Gene expression quantification",
                     -    platform = "Illumina HiSeq",
                     -    file.type  = "normalized_results",
                     -    experimental.strategy = "RNA-Seq",
                     -    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
                     -    legacy = TRUE
                     -)
                     -GDCdownload(
                     -    query = query,
                     -    method = "api",
                     -    files.per.chunk = 10
                     -)
                     -data <- GDCprepare(query = query)
                     -```
+                    -
                     -```{r message=FALSE, warning=FALSE, include=FALSE}
                     -data <- gbm.exp.legacy
                     -```
+                    -
                     -```{r message=FALSE, warning=FALSE}
                     -# Gene expression aligned against hg19.
                     -datatable(
                     -    as.data.frame(colData(data)),
                     -    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                     -    rownames = FALSE)
                     -# Only first 20 rows to make render faster
                     -datatable(
                     -    assay(data)[1:20,],
                     -    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                     -    rownames = TRUE
                     -)
+                    -
                     -rowRanges(data)
                     -```
                      ## Search and download data for two samples from database
@@ -238,44 +186,6 @@ Examples of query, download, prepare can be found in this [gist](https://gist.gi
                      | Biospecimen                 | Biospecimen Supplement            |      |             |
                      | Clinical                    |                |       |                  |
                     -## Legacy data
                     -| Data.category               | Data.type                         | Platform                            | file.type          | Status          |
                     -|-----------------------------|-----------------------------------|-------------------------------------|--------------------|-----------------|
                     -| Transcriptome Profiling     |                                   |                                     |                    |                 |
                     -| Copy number variation       | -                                 | Affymetrix SNP Array 6.0            | nocnv_hg18.seg     | Working         |
                     -|                             | -                                 | Affymetrix SNP Array 6.0            | hg18.seg           | Working         |
                     -|                             | -                                 | Affymetrix SNP Array 6.0            | nocnv_hg19.seg     | Working         |
                     -|                             | -                                 | Affymetrix SNP Array 6.0            | hg19.seg           | Working         |
                     -|                             | -                                 | Illumina HiSeq                      | Several            | Working         |
                     -| Simple Nucleotide Variation | Simple somatic mutation           |                                     |                    |                 |
                     -| Raw Sequencing Data         |                                   |                                     |                    |                 |
                     -| Biospecimen                 |                                   |                                     |                    |                 |
                     -| Clinical                    |                                   |                                     |                    |                 |
                     -| Protein expression          |                                   | MDA RPPA Core                       | -                  | Working         |
                     -| Gene expression             | Gene expression quantification    | Illumina HiSeq                      | normalized_results | Working         |
                     -|                             |                                   | Illumina HiSeq                      | results            | Working         |
                     -|                             |                                   | HT_HG-U133A                         | -                  | Working         |
                     -|                             |                                   | AgilentG4502A_07_2                  | -                  | Data frame only |
                     -|                             |                                   | AgilentG4502A_07_1                  | -                  | Data frame only |
                     -|                             |                                   | HuEx-1_0-st-v2                      | FIRMA.txt          | Not Preparing   |
                     -|                             |                                   |                                     | gene.txt           | Not Preparing   |
                     -|                             | Isoform expression quantification |                                     |                    |                 |
                     -|                             | miRNA gene quantification         |                                     |                    |                 |
                     -|                             | Exon junction quantification      |                                     |                    |                 |
                     -|                             | Exon quantification               |                                     |                    |                 |
                     -|                             | miRNA isoform quantification      |                                     |                    |                 |
                     -|                             |                                   |                                     |                    |                 |
                     -| DNA methylation             |                                   | Illumina Human Methylation 450      | Not used           | Working         |
                     -|                             |                                   | Illumina Human Methylation 27       | Not used           | Working         |
                     -|                             |                                   | Illumina DNA Methylation OMA003 CPI | Not used           | Working         |
                     -|                             |                                   | Illumina DNA Methylation OMA002 CPI | Not used           | Working         |
                     -|                             |                                   | Illumina Hi Seq                     |                    | Not  working    |
                     -| Raw Microarray Data         |                                   |                                     |                    |                 |
                     -| Structural Rearrangement    |                                   |                                     |                    |                 |
                     -| Other                       |                                   |                                     |                    |                 |
+                    -
+                    -
+                    -
                      # Examples
@@ -444,8 +354,7 @@ query <- GDCquery(
                          project = "TCGA-BRCA",
                          data.category = "DNA Methylation",
                          data.type = "Masked Intensities",
                     -    platform = "Illumina Human Methylation 27",
                     -    legacy = FALSE
                     +    platform = "Illumina Human Methylation 27"
+                     )
                      GDCdownload(query, files.per.chunk=10)
                      betas <- GDCprepare(query)
@@ -454,10 +363,9 @@ query <- GDCquery(
                          project = "HCMI-CMDC",
                          data.category = "DNA Methylation",
                          data.type = "Masked Intensities",
                     -    platform = "Illumina Methylation Epic",
                     -    legacy = FALSE
                     +    platform = "Illumina Methylation Epic"
+                     )
                     -GDCdownload(query, files.per.chunk=10)
                     +GDCdownload(query, files.per.chunk = 10)
                      betas <- GDCprepare(query)
@@ -465,8 +373,7 @@ query <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "DNA Methylation",
                          data.type = "Masked Intensities",
                     -    platform = "Illumina Methylation Epic",
                     -    legacy = FALSE
                     +    platform = "Illumina Methylation Epic"
+                     )
                      GDCdownload(query, files.per.chunk=10)
                      betas <- GDCprepare(query)
@@ -475,10 +382,9 @@ query <- GDCquery(
                          project = "TCGA-BRCA",
                          data.category = "DNA Methylation",
                          data.type = "Masked Intensities",
                     -    platform = "Illumina Methylation Epic",
                     -    legacy = FALSE
                     +    platform = "Illumina Methylation Epic"
+                     )
                     -GDCdownload(query, files.per.chunk=10)
                     +GDCdownload(query, files.per.chunk = 10)
                      betas <- GDCprepare(query)
@@ -571,7 +477,6 @@ https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeli
                      query.sc.analysis <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "Transcriptome Profiling",
                     -    legacy = FALSE,
                          access = "open",
                          data.type = "Single Cell Analysis",
                          data.format =  "TSV"
@@ -584,7 +489,6 @@ Single.Cell.Analysis.list <- GDCprepare(query.sc.analysis)
                      query.hdF5 <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "Transcriptome Profiling",
                     -    legacy = FALSE,
                          access = "open",
                          data.type = "Single Cell Analysis",
                          barcode = c("CPT0167860015","CPT0206880004"),
@@ -598,7 +502,6 @@ df.HDF5 <- GDCprepare(query.hdF5)
                      query.raw.counts <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "Transcriptome Profiling",
                     -    legacy = FALSE,
                          access = "open",
                          data.type = "Gene Expression Quantification",
                          barcode = c("CPT0167860015","CPT0206880004"),
@@ -612,7 +515,6 @@ raw.counts.list <- GDCprepare(query.raw.counts)
                      query.filtered.counts <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "Transcriptome Profiling",
                     -    legacy = FALSE,
                          access = "open",
                          data.type = "Gene Expression Quantification",
                          barcode = c("CPT0167860015","CPT0206880004"),
@@ -627,7 +529,6 @@ filtered.counts.list <- GDCprepare(query.filtered.counts)
                      query.sc.dea <- GDCquery(
                          project = "CPTAC-3",
                          data.category = "Transcriptome Profiling",
                     -    legacy = FALSE,
                          access = "open",
                          data.type = "Differential Gene Expression",
                          barcode = c("CPT0167860015","CPT0206880004"),
@@ -636,91 +537,3 @@ query.sc.dea <- GDCquery(
                      GDCdownload(query.sc.dea)
                      sc.dea.list <- GDCprepare(query.sc.dea)
                      ```
+                    -
                     -## Legacy archive: data aligned against hg19
+                    -
                     -### DNA methylation: Get all TCGA IDAT files
+                    -
                     -```{r message=FALSE, warning=FALSE, eval =FALSE}
                     -#-------------------------------------------------------
                     -# Example to idat files from TCGA projects
                     -#-------------------------------------------------------
                     -projects <- TCGAbiolinks:::getGDCprojects()$project_id
                     -projects <- projects[grepl('^TCGA',projects,perl=T)]
                     -match.file.cases.all <- NULL
                     -for(proj in projects){
                     -    print(proj)
                     -    query <- GDCquery(
                     -        project = proj,
                     -        data.category = "Raw microarray data",
                     -        data.type = "Raw intensities",
                     -        experimental.strategy = "Methylation array",
                     -        legacy = TRUE,
                     -        file.type = ".idat",
                     -        platform = "Illumina Human Methylation 450"
                     -    )
                     -    match.file.cases <- getResults(query,cols=c("cases","file_name"))
                     -    match.file.cases$project <- proj
                     -    match.file.cases.all <- rbind(match.file.cases.all,match.file.cases)
                     -    tryCatch(
                     -        GDCdownload(query, method = "api", files.per.chunk = 20),
                     -        error = function(e) GDCdownload(query, method = "client")
                     -    )
                     -}
                     -# This will create a map between idat file name, cases (barcode) and project
                     -readr::write_tsv(match.file.cases.all, path =  "idat_filename_case.txt")
                     -# code to move all files to local folder
                     -for(file in dir(".",pattern = ".idat", recursive = T)){
                     -    TCGAbiolinks::move(file,basename(file))
                     -}
                     -```
+                    -
+                    -
                     -### DNA methylation
+                    -
                     -```{r, eval = FALSE}
                     -query_meth.hg19 <- GDCquery(
                     -    project= "TCGA-LGG",
                     -    data.category = "DNA methylation",
                     -    platform = "Illumina Human Methylation 450",
                     -    barcode = c("TCGA-HT-8111-01A-11D-2399-05","TCGA-HT-A5R5-01A-11D-A28N-05"),
                     -    legacy = TRUE
                     -)
                     -GDCdownload(query_meth.hg19)
                     -data.hg19 <- GDCprepare(query_meth.hg19)
                     -```
+                    -
+                    -
                     -### Protein expression
                     -```{r, eval = FALSE}
                     -query <- GDCquery(
                     -    project = "TCGA-GBM",
                     -    data.category = "Protein expression",
                     -    legacy = TRUE,
                     -    barcode = c("TCGA-OX-A56R-01A-21-A44T-20","TCGA-08-0357-01A-21-1898-20")
                     -)
                     -GDCdownload(query)
                     -data <- GDCprepare(
                     -    query, save = TRUE,
                     -    save.filename = "gbmProteinExpression.rda",
                     -    remove.files.prepared = TRUE
                     -)
                     -```
+                    -
+                    -
                     -### Gene expression
                     -```{r, eval = FALSE}
                     -# Aligned against Hg19
                     -query.exp.hg19 <- GDCquery(
                     -    project = "TCGA-GBM",
                     -    data.category = "Gene expression",
                     -    data.type = "Gene expression quantification",
                     -    platform = "Illumina HiSeq",
                     -    file.type  = "normalized_results",
                     -    experimental.strategy = "RNA-Seq",
                     -    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
                     -    legacy = TRUE
                     -)
                     -GDCdownload(query.exp.hg19)
                     -data <- GDCprepare(query.exp.hg19)
                     -```

vignettes/query.Rmd

History View file @ c8404f5f

@@ -18,8 +18,6 @@ knitr::opts_knit$set(progress = FALSE)
                      **TCGAbiolinks** has provided a few functions to search GDC database.
                     -This section starts by explaining the different GDC sources (Harmonized and Legacy Archive), followed by some examples
                     -how to access them.
                      ---
@@ -33,23 +31,6 @@ library(DT)
                      #  Useful information
                     -<div class="panel panel-info">
                     -<div class="panel-heading">Different sources: Legacy vs Harmonized</div>
                     -<div class="panel-body">
+                    -
+                    -
                     -There are two available sources to download GDC data using TCGAbiolinks:
+                    -
                     -- GDC Legacy Archive : provides access to an unmodified copy of data that was previously stored in
                     -[CGHub](https://cghub.ucsc.edu/) and in the TCGA Data Portal hosted by the TCGA Data Coordinating Center (DCC), in which uses
                     -as references GRCh37 (hg19) and GRCh36 (hg18).
                     -- GDC harmonized database: data available was harmonized against GRCh38 (hg38) using GDC Bioinformatics Pipelines
                     -which provides methods to the standardization of biospecimen and
                     -clinical data.
+                    -
                     -</div>
                     -</div>
+                    -
                      <div class="panel panel-info">
                      <div class="panel-heading">Understanding the barcode</div>
@@ -79,7 +60,6 @@ with the following arguments:
                      | data.category 	| A valid project (see list with TCGAbiolinks:::getProjectSummary(project)) 	|  	|
                      | data.type 	| A data type to filter the files to download 	|  	|
                      | workflow.type 	| GDC workflow type 	|  	|
                     -| legacy 	| Search in the legacy repository 	|  	|
                      | access 	| Filter by access type. Possible values: controlled, open 	|  	|
                      | platform 	| Example: 	|  	|
                      |  	| CGH- 1x1M_G4447A 	| IlluminaGA_RNASeqV2 	|
@@ -107,7 +87,7 @@ with the following arguments:
                      |  	| IlluminaHiSeq_RNASeqV2 	| Mixed_DNASeq_Cont 	|
                      | file.type 	| To be used in the legacy database for some platforms, to define which file types to be used. 	|  	|
                      | barcode 	| A list of barcodes to filter the files to download 	|  	|
                     -| experimental.strategy 	| Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. Legacy: WXS, RNA-Seq, miRNA-Seq, Genotyping Array, DNA-Seq, Methylation array, Protein expression array, WXS,CGH array, VALIDATION, Gene expression array,WGS, MSI-Mono-Dinucleotide Assay, miRNA expression array, Mixed strategies, AMPLICON, Exon array, Total RNA-Seq, Capillary sequencing, Bisulfite-Seq 	|  	|
                     +| experimental.strategy 	| Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. |  	|
                      | sample.type 	| A sample type to filter the files to download 	|  	|
@@ -138,7 +118,7 @@ datatable(
                      The other fields (data.category, data.type, workflow.type, platform, file.type) can be found below.
                      Please note that these tables are still incomplete.
                     -## Harmonized data options (`legacy = FALSE`)
                     +## Harmonized data options
                      ```{r, echo=FALSE}
                      datatable(
@@ -149,21 +129,12 @@ datatable(
+                     )
                      ```
                     -## Legacy archive data  options (`legacy = TRUE`)
                     -```{r, echo=FALSE}
                     -datatable(
                     -    readr::read_csv("https://docs.google.com/spreadsheets/d/1f98kFdj9mxVDc1dv4xTZdx8iWgUiDYO-qiFJINvmTZs/export?format=csv&gid=1817673686",col_types = readr::cols()),
                     -    filter = 'top',
                     -    options = list(scrollX = TRUE, keys = TRUE, pageLength = 40),
                     -    rownames = FALSE
                     -)
                     -```
                      # Harmonized database examples
                      ## DNA methylation data: Recurrent tumor samples
                     -In this example we will access the harmonized database (`legacy = FALSE`)
                     +In this example we will access the harmonized database
                      and search for all DNA methylation data for recurrent glioblastoma multiforme (GBM)
                      and low grade gliomas (LGG) samples.
@@ -172,7 +143,6 @@ and low grade gliomas (LGG) samples.
                      query <- GDCquery(
                          project = c("TCGA-GBM", "TCGA-LGG"),
                          data.category = "DNA Methylation",
                     -    legacy = FALSE,
                          platform = c("Illumina Human Methylation 450"),
                          sample.type = "Recurrent Tumor"
+                     )
@@ -186,19 +156,18 @@ datatable(
                      ## Samples with DNA methylation and gene expression data
                     -In this example we will access the harmonized database (`legacy = FALSE`)
                     +In this example we will access the harmonized database
                      and search for all patients with DNA methylation (platform HumanMethylation450k) and gene expression data
                      for Colon Adenocarcinoma tumor (TCGA-COAD).
                      ```{r message=FALSE, warning = FALSE, eval = FALSE}
                     -query.met <- GDCquery(
                     +query_met <- GDCquery(
                          project = "TCGA-COAD",
                          data.category = "DNA Methylation",
                     -    legacy = FALSE,
                          platform = c("Illumina Human Methylation 450")
+                     )
                     -query.exp <- GDCquery(
                     +query_exp <- GDCquery(
                          project = "TCGA-COAD",
                          data.category = "Transcriptome Profiling",
                          data.type = "Gene Expression Quantification",
@@ -207,20 +176,19 @@ query.exp <- GDCquery(
                      # Get all patients that have DNA methylation and gene expression.
                      common.patients <- intersect(
                     -    substr(getResults(query.met, cols = "cases"), 1, 12),
                     -    substr(getResults(query.exp, cols = "cases"), 1, 12)
                     +    substr(getResults(query_met, cols = "cases"), 1, 12),
                     +    substr(getResults(query_exp, cols = "cases"), 1, 12)
+                     )
                      # Only select the first 5 patients
                     -query.met <- GDCquery(
                     +query_met <- GDCquery(
                          project = "TCGA-COAD",
                          data.category = "DNA Methylation",
                     -    legacy = FALSE,
                          platform = c("Illumina Human Methylation 450"),
                          barcode = common.patients[1:5]
+                     )
                     -query.exp <- GDCquery(
                     +query_exp <- GDCquery(
                          project = "TCGA-COAD",
                          data.category = "Transcriptome Profiling",
                          data.type = "Gene Expression Quantification",
@@ -231,13 +199,13 @@ query.exp <- GDCquery(
                      ```{r results_matched, message=FALSE, warning=FALSE, eval = FALSE}
                      datatable(
                     -    getResults(query.met, cols = c("data_type","cases")),
                     +    getResults(query_met, cols = c("data_type","cases")),
                          filter = 'top',
                          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                          rownames = FALSE
+                     )
                      datatable(
                     -    getResults(query.exp, cols = c("data_type","cases")),
                     +    getResults(query_exp, cols = c("data_type","cases")),
                          filter = 'top',
                          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                          rownames = FALSE
@@ -327,98 +295,13 @@ datatable(
                      ```
                     -# Legacy archive examples
+                    -
                     -## DNA methylation
+                    -
                     -### Array-based assays
+                    -
                     -This example shows how the user can search for  glioblastoma multiform (GBM)
                     -and DNA methylation data
                     -for platform Illumina Human Methylation 450 and Illumina Human Methylation 27.
+                    -
                     -```{r message=FALSE, warning=FALSE}
                     -query <- GDCquery(
                     -    project = c("TCGA-GBM"),
                     -    legacy = TRUE,
                     -    data.category = "DNA methylation",
                     -    platform = c("Illumina Human Methylation 450", "Illumina Human Methylation 27")
                     -)
                     -datatable(
                     -    getResults(query, rows = 1:100),
                     -    filter = 'top',
                     -    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                     -    rownames = FALSE
                     -)
                     -```
+                    -
                     -### whole-genome bisulfite sequencing (WGBS)
+                    -
                     -```{r message = FALSE, warning = FALSE, eval = FALSE}
+                    -
                     -query <- GDCquery(
                     -    project = c("TCGA-LUAD"),
                     -    legacy = TRUE,
                     -    data.category = "DNA methylation",
                     -    data.type = "Methylation percentage",
                     -    experimental.strategy = "Bisulfite-Seq"
                     -)
+                    -
                     -# VCF - controlled data
                     -query <- GDCquery(
                     -    project = c("TCGA-LUAD"),
                     -    legacy = TRUE,
                     -    data.category = "DNA methylation",
                     -    data.type = "Bisulfite sequence alignment",
                     -    experimental.strategy = "Bisulfite-Seq"
                     -)
+                    -
+                    -
                     -# WGBS BAM files - controlled data
                     -query <- GDCquery(
                     -    project = c("TCGA-LUAD"),
                     -    legacy = TRUE,
                     -    data.type = "Aligned reads",
                     -    data.category = "Raw sequencing data",
                     -    experimental.strategy = "Bisulfite-Seq"
                     -)
                     -```
+                    -
+                    -
                     -## Gene expression
+                    -
                     -This exmaple shows how the user can search for  glioblastoma multiform (GBM)
                     -gene expression data with the normalized results for expression of a gene.
                     -For more information about file.types check [GDC TCGA file types](https://gdc.cancer.gov/resources-tcga-users/legacy-archive-tcga-tag-descriptions)
+                    -
                     -```{r message=FALSE, warning=FALSE}
                     -# Gene expression aligned against hg19.
                     -query.exp.hg19 <- GDCquery(
                     -    project = "TCGA-GBM",
                     -    data.category = "Gene expression",
                     -    data.type = "Gene expression quantification",
                     -    platform = "Illumina HiSeq",
                     -    file.type  = "normalized_results",
                     -    experimental.strategy = "RNA-Seq",
                     -    barcode = c("TCGA-14-0736-02A-01R-2005-01", "TCGA-06-0211-02A-02R-2005-01"),
                     -    legacy = TRUE
                     -)
+                    -
                     -datatable(
                     -    getResults(query.exp.hg19),
                     -    filter = 'top',
                     -    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
                     -    rownames = FALSE
                     -)
                     -```
+                    -
                      # Get Manifest file
                      If you want to get the manifest file from the query object you can use the function *getManifest*. If you
                     -set save to TRUEm a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [ddt-ui](https://github.com/NCI-GDC/dtt-ui) will be created.
                     +set save to `TRUE`, a txt file that can be used with GDC-client Data transfer tool (DTT) or with its GUI version [dtt-ui](https://github.com/NCI-GDC/dtt-ui) will be created.
                      ```{r message=FALSE, warning=FALSE}
                     -getManifest(query.exp.hg19,save = FALSE)
                     +getManifest(query,save = FALSE)
                      ```
                      # ATAC-seq data
@@ -440,10 +323,10 @@ datatable(
                      You can use the function `GDCquery_ATAC_seq` to filter the manifest table and use `GDCdownload` to save the data locally.
                      ```{r message=FALSE, warning=FALSE,eval = FALSE}
                      query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "rds")
                     -GDCdownload(query,method = "client")
                     +GDCdownload(query, method = "client")
                      query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "bigWigs")
                     -GDCdownload(query,method = "client")
                     +GDCdownload(query, method = "client")
                      ```