Browse code

Update

Merge branch 'devel' into RELEASE_3_17

# Conflicts:
# DESCRIPTION

Tiago Silva authored on 05/10/2023 16:52:55
Showing 5 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: TCGAbiolinks
2 2
 Type: Package
3 3
 Title: TCGAbiolinks: An R/Bioconductor package for integrative analysis with GDC data
4
-Version: 2.28.3
4
+Version: 2.28.4
5 5
 Date: 2023-06-06
6 6
 Author: Antonio Colaprico,
7 7
     Tiago Chedraoui Silva,
... ...
@@ -51,20 +51,21 @@ gliomaClassifier <- function(data){
51 51
     data(list = models, package = "TCGAbiolinksGUI.data",envir = env)
52 52
 
53 53
     for(i in models){
54
-        model <- get(i,envir = env)
54
+        message("------------------------------------------------------")
55
+        message("Model: ",i)
56
+        model <- get(i, envir = env)
55 57
         # If it is a Summarized Experiment object
56 58
 
57 59
         # keep only probes used in the model
58 60
         aux <- met[,colnames(met) %in% colnames(model$trainingData),drop = FALSE]
59
-
60 61
         # This should not happen!
61 62
         if(any(apply(aux,2,function(x) all(is.na(x))))) {
62
-            print("NA columns")
63
+            message("o Probes has NA value for all samples. Setting to 0.5 since model does not accept NA")
63 64
             aux[,apply(aux,2,function(x) all(is.na(x)))] <- 0.5
64 65
         }
65 66
 
66 67
         if(any(apply(aux,2,function(x) any(is.na(x))))) {
67
-            print("NA values")
68
+            message("o Probes has NA values for some samples. Setting values a the median of the sample since model does not accept NA ")
68 69
             colMedians <- colMedians(aux,na.rm = TRUE)
69 70
             x <- which(is.na(aux),arr.ind = TRUE)
70 71
             for(l in 1:nrow(x)){
... ...
@@ -72,6 +73,15 @@ gliomaClassifier <- function(data){
72 73
             }
73 74
         }
74 75
 
76
+        # For missing probes add values to 0.5
77
+        missing_probes <- setdiff(colnames(model$trainingData), colnames(met))
78
+        if(length(missing_probes) > 0) {
79
+            message("o Probes are missing. Setting dummy probes to matrix with 0.5 value to all samples.")
80
+            missing_probes_matrix <- matrix(rep(0.5, nrow(met) * length(missing_probes)),nrow = nrow(met))
81
+            colnames(missing_probes_matrix) <- missing_probes
82
+            aux <- bind_cols(aux,missing_probes_matrix)
83
+        }
84
+
75 85
         pred <- predict(model, aux)
76 86
         pred.prob <- predict(model, aux, type = "prob")
77 87
         colnames(pred.prob) <- paste0(i,"_", colnames(pred.prob))
... ...
@@ -296,7 +296,7 @@ GDCquery_clinic <- function(
296 296
             if("treatments" %in% colnames(df)){
297 297
                 treatments <- rbindlist(df$treatments,fill = TRUE)
298 298
                 df$treatments <- NULL
299
-                treatments$submitter_id <- gsub("_treatment(_[0-9])?","", treatments$submitter_id)
299
+                treatments$submitter_id <- gsub("_treatment(_[0-9])?|_treatment([0-9])?","", treatments$submitter_id)
300 300
                 treatments <- treatments[,-c("updated_datetime", "state", "created_datetime")]
301 301
 
302 302
                 # we have now two types of treatment
... ...
@@ -402,7 +402,8 @@ GDCquery_clinic <- function(
402 402
             df$project <- project
403 403
             df <- df %>% dplyr::relocate(project)
404 404
         }
405
-        if(nrow(results) != nrow(df)){
405
+
406
+        if (nrow(results) != nrow(df)) {
406 407
             stop("Error: API returned more information")
407 408
         }
408 409
 
... ...
@@ -139,7 +139,7 @@ GDCprepare <- function(
139 139
     }
140 140
 
141 141
     cases <- ifelse(
142
-        grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()),
142
+        grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-2",query$results[[1]]$project %>% unlist()),
143 143
         query$results[[1]]$cases,
144 144
         query$results[[1]]$sample.submitter_id
145 145
     )
... ...
@@ -478,35 +478,21 @@ readmiRNAIsoformQuantification <- function (files, cases){
478 478
     setDF(df)
479 479
 
480 480
 }
481
+
481 482
 readSimpleNucleotideVariationMaf <- function(files){
482 483
 
483
-    ret <- plyr::adply(.data = files,.margins = 1,.fun = function(f){
484
-        readr::read_tsv(
485
-            f,
486
-            comment = "#",
487
-            col_types = readr::cols(
488
-                Entrez_Gene_Id = col_integer(),
489
-                Start_Position = col_integer(),
490
-                End_Position = col_integer(),
491
-                t_depth = col_integer(),
492
-                t_ref_count = col_integer(),
493
-                t_alt_count = col_integer(),
494
-                n_depth = col_integer(),
495
-                TRANSCRIPT_STRAND = col_integer(),
496
-                PICK = col_integer(),
497
-                miRNA = col_character(),
498
-                TSL = col_integer(),
499
-                HGVS_OFFSET = col_integer()
500
-            ),
501
-            progress = TRUE
502
-        )
503
-    })
504
-    if(ncol(ret) == 1) {
505
-        ret <- plyr::adply(.data = files,.margins = 1,.fun = function(f){
506
-            read_tsv(
507
-                f,
484
+    ret <- files |>
485
+        purrr::map_dfr(.f = function(x) {
486
+            tab <- readr::read_tsv(
487
+                x,
488
+                show_col_types = FALSE,
508 489
                 comment = "#",
509
-                col_types = cols(
490
+                col_types = readr::cols(
491
+                    SOMATIC =  col_character(),
492
+                    PUBMED = col_character(),
493
+                    miRNA = col_character(),
494
+                    HGVS_OFFSET = col_integer(),
495
+                    PHENO = col_character(),
510 496
                     Entrez_Gene_Id = col_integer(),
511 497
                     Start_Position = col_integer(),
512 498
                     End_Position = col_integer(),
... ...
@@ -514,16 +500,18 @@ readSimpleNucleotideVariationMaf <- function(files){
514 500
                     t_ref_count = col_integer(),
515 501
                     t_alt_count = col_integer(),
516 502
                     n_depth = col_integer(),
517
-                    ALLELE_NUM = col_integer(),
518 503
                     TRANSCRIPT_STRAND = col_integer(),
519 504
                     PICK = col_integer(),
520 505
                     TSL = col_integer(),
521
-                    HGVS_OFFSET = col_integer(),
522
-                    MINIMISED = col_integer()),
523
-                progress = TRUE
524
-            )
506
+                    DISTANCE = col_integer()
507
+                ))
508
+
509
+            # empty MAF file
510
+            # https://portal.gdc.cancer.gov/files/7917fcbe-cb66-447d-8ea8-3a324feee3fa
511
+            if(nrow(tab) == 0) {return (NULL)}
512
+            tab
525 513
         })
526
-    }
514
+
527 515
     return(ret)
528 516
 }
529 517
 
... ...
@@ -55,25 +55,25 @@ the same result as the paper.
55 55
 
56 56
 ```{r, eval = FALSE, message = FALSE, results = "hide"}
57 57
 query <- GDCquery(
58
-  project = "TCGA-GBM",
59
-  data.category = "DNA methylation",
60
-  barcode = c("TCGA-06-0122","TCGA-14-1456"),
61
-  platform = "Illumina Human Methylation 27",
62
-  legacy = TRUE
58
+    project = "TCGA-GBM",
59
+    data.category = "DNA Methylation",
60
+    barcode = c("TCGA-06-0122","TCGA-14-1456"),
61
+    platform = "Illumina Human Methylation 27",
62
+    data.type = "Methylation Beta Value"
63 63
 )
64 64
 GDCdownload(query)
65
-data.hg19 <- GDCprepare(query)
65
+dnam <- GDCprepare(query)
66 66
 ```
67 67
 
68 68
 ```{r, eval = FALSE}
69
-assay(data.hg19)[1:5,1:2]
69
+assay(dnam)[1:5,1:2]
70 70
 ```
71 71
 
72 72
 ## Function
73 73
 <hr>
74 74
 
75 75
 ```{r, eval = FALSE}
76
-classification <- gliomaClassifier(data.hg19)
76
+classification <- gliomaClassifier(dnam)
77 77
 ```
78 78
 
79 79
 ## Results