Merge branch 'devel' into RELEASE_3_17
# Conflicts:
# DESCRIPTION
... | ... |
@@ -51,20 +51,21 @@ gliomaClassifier <- function(data){ |
51 | 51 |
data(list = models, package = "TCGAbiolinksGUI.data",envir = env) |
52 | 52 |
|
53 | 53 |
for(i in models){ |
54 |
- model <- get(i,envir = env) |
|
54 |
+ message("------------------------------------------------------") |
|
55 |
+ message("Model: ",i) |
|
56 |
+ model <- get(i, envir = env) |
|
55 | 57 |
# If it is a Summarized Experiment object |
56 | 58 |
|
57 | 59 |
# keep only probes used in the model |
58 | 60 |
aux <- met[,colnames(met) %in% colnames(model$trainingData),drop = FALSE] |
59 |
- |
|
60 | 61 |
# This should not happen! |
61 | 62 |
if(any(apply(aux,2,function(x) all(is.na(x))))) { |
62 |
- print("NA columns") |
|
63 |
+ message("o Probes has NA value for all samples. Setting to 0.5 since model does not accept NA") |
|
63 | 64 |
aux[,apply(aux,2,function(x) all(is.na(x)))] <- 0.5 |
64 | 65 |
} |
65 | 66 |
|
66 | 67 |
if(any(apply(aux,2,function(x) any(is.na(x))))) { |
67 |
- print("NA values") |
|
68 |
+ message("o Probes has NA values for some samples. Setting values a the median of the sample since model does not accept NA ") |
|
68 | 69 |
colMedians <- colMedians(aux,na.rm = TRUE) |
69 | 70 |
x <- which(is.na(aux),arr.ind = TRUE) |
70 | 71 |
for(l in 1:nrow(x)){ |
... | ... |
@@ -72,6 +73,15 @@ gliomaClassifier <- function(data){ |
72 | 73 |
} |
73 | 74 |
} |
74 | 75 |
|
76 |
+ # For missing probes add values to 0.5 |
|
77 |
+ missing_probes <- setdiff(colnames(model$trainingData), colnames(met)) |
|
78 |
+ if(length(missing_probes) > 0) { |
|
79 |
+ message("o Probes are missing. Setting dummy probes to matrix with 0.5 value to all samples.") |
|
80 |
+ missing_probes_matrix <- matrix(rep(0.5, nrow(met) * length(missing_probes)),nrow = nrow(met)) |
|
81 |
+ colnames(missing_probes_matrix) <- missing_probes |
|
82 |
+ aux <- bind_cols(aux,missing_probes_matrix) |
|
83 |
+ } |
|
84 |
+ |
|
75 | 85 |
pred <- predict(model, aux) |
76 | 86 |
pred.prob <- predict(model, aux, type = "prob") |
77 | 87 |
colnames(pred.prob) <- paste0(i,"_", colnames(pred.prob)) |
... | ... |
@@ -296,7 +296,7 @@ GDCquery_clinic <- function( |
296 | 296 |
if("treatments" %in% colnames(df)){ |
297 | 297 |
treatments <- rbindlist(df$treatments,fill = TRUE) |
298 | 298 |
df$treatments <- NULL |
299 |
- treatments$submitter_id <- gsub("_treatment(_[0-9])?","", treatments$submitter_id) |
|
299 |
+ treatments$submitter_id <- gsub("_treatment(_[0-9])?|_treatment([0-9])?","", treatments$submitter_id) |
|
300 | 300 |
treatments <- treatments[,-c("updated_datetime", "state", "created_datetime")] |
301 | 301 |
|
302 | 302 |
# we have now two types of treatment |
... | ... |
@@ -402,7 +402,8 @@ GDCquery_clinic <- function( |
402 | 402 |
df$project <- project |
403 | 403 |
df <- df %>% dplyr::relocate(project) |
404 | 404 |
} |
405 |
- if(nrow(results) != nrow(df)){ |
|
405 |
+ |
|
406 |
+ if (nrow(results) != nrow(df)) { |
|
406 | 407 |
stop("Error: API returned more information") |
407 | 408 |
} |
408 | 409 |
|
... | ... |
@@ -139,7 +139,7 @@ GDCprepare <- function( |
139 | 139 |
} |
140 | 140 |
|
141 | 141 |
cases <- ifelse( |
142 |
- grepl("TCGA|TARGET|CGCI-HTMCP-CC",query$results[[1]]$project %>% unlist()), |
|
142 |
+ grepl("TCGA|TARGET|CGCI-HTMCP-CC|CPTAC-2",query$results[[1]]$project %>% unlist()), |
|
143 | 143 |
query$results[[1]]$cases, |
144 | 144 |
query$results[[1]]$sample.submitter_id |
145 | 145 |
) |
... | ... |
@@ -478,35 +478,21 @@ readmiRNAIsoformQuantification <- function (files, cases){ |
478 | 478 |
setDF(df) |
479 | 479 |
|
480 | 480 |
} |
481 |
+ |
|
481 | 482 |
readSimpleNucleotideVariationMaf <- function(files){ |
482 | 483 |
|
483 |
- ret <- plyr::adply(.data = files,.margins = 1,.fun = function(f){ |
|
484 |
- readr::read_tsv( |
|
485 |
- f, |
|
486 |
- comment = "#", |
|
487 |
- col_types = readr::cols( |
|
488 |
- Entrez_Gene_Id = col_integer(), |
|
489 |
- Start_Position = col_integer(), |
|
490 |
- End_Position = col_integer(), |
|
491 |
- t_depth = col_integer(), |
|
492 |
- t_ref_count = col_integer(), |
|
493 |
- t_alt_count = col_integer(), |
|
494 |
- n_depth = col_integer(), |
|
495 |
- TRANSCRIPT_STRAND = col_integer(), |
|
496 |
- PICK = col_integer(), |
|
497 |
- miRNA = col_character(), |
|
498 |
- TSL = col_integer(), |
|
499 |
- HGVS_OFFSET = col_integer() |
|
500 |
- ), |
|
501 |
- progress = TRUE |
|
502 |
- ) |
|
503 |
- }) |
|
504 |
- if(ncol(ret) == 1) { |
|
505 |
- ret <- plyr::adply(.data = files,.margins = 1,.fun = function(f){ |
|
506 |
- read_tsv( |
|
507 |
- f, |
|
484 |
+ ret <- files |> |
|
485 |
+ purrr::map_dfr(.f = function(x) { |
|
486 |
+ tab <- readr::read_tsv( |
|
487 |
+ x, |
|
488 |
+ show_col_types = FALSE, |
|
508 | 489 |
comment = "#", |
509 |
- col_types = cols( |
|
490 |
+ col_types = readr::cols( |
|
491 |
+ SOMATIC = col_character(), |
|
492 |
+ PUBMED = col_character(), |
|
493 |
+ miRNA = col_character(), |
|
494 |
+ HGVS_OFFSET = col_integer(), |
|
495 |
+ PHENO = col_character(), |
|
510 | 496 |
Entrez_Gene_Id = col_integer(), |
511 | 497 |
Start_Position = col_integer(), |
512 | 498 |
End_Position = col_integer(), |
... | ... |
@@ -514,16 +500,18 @@ readSimpleNucleotideVariationMaf <- function(files){ |
514 | 500 |
t_ref_count = col_integer(), |
515 | 501 |
t_alt_count = col_integer(), |
516 | 502 |
n_depth = col_integer(), |
517 |
- ALLELE_NUM = col_integer(), |
|
518 | 503 |
TRANSCRIPT_STRAND = col_integer(), |
519 | 504 |
PICK = col_integer(), |
520 | 505 |
TSL = col_integer(), |
521 |
- HGVS_OFFSET = col_integer(), |
|
522 |
- MINIMISED = col_integer()), |
|
523 |
- progress = TRUE |
|
524 |
- ) |
|
506 |
+ DISTANCE = col_integer() |
|
507 |
+ )) |
|
508 |
+ |
|
509 |
+ # empty MAF file |
|
510 |
+ # https://portal.gdc.cancer.gov/files/7917fcbe-cb66-447d-8ea8-3a324feee3fa |
|
511 |
+ if(nrow(tab) == 0) {return (NULL)} |
|
512 |
+ tab |
|
525 | 513 |
}) |
526 |
- } |
|
514 |
+ |
|
527 | 515 |
return(ret) |
528 | 516 |
} |
529 | 517 |
|
... | ... |
@@ -55,25 +55,25 @@ the same result as the paper. |
55 | 55 |
|
56 | 56 |
```{r, eval = FALSE, message = FALSE, results = "hide"} |
57 | 57 |
query <- GDCquery( |
58 |
- project = "TCGA-GBM", |
|
59 |
- data.category = "DNA methylation", |
|
60 |
- barcode = c("TCGA-06-0122","TCGA-14-1456"), |
|
61 |
- platform = "Illumina Human Methylation 27", |
|
62 |
- legacy = TRUE |
|
58 |
+ project = "TCGA-GBM", |
|
59 |
+ data.category = "DNA Methylation", |
|
60 |
+ barcode = c("TCGA-06-0122","TCGA-14-1456"), |
|
61 |
+ platform = "Illumina Human Methylation 27", |
|
62 |
+ data.type = "Methylation Beta Value" |
|
63 | 63 |
) |
64 | 64 |
GDCdownload(query) |
65 |
-data.hg19 <- GDCprepare(query) |
|
65 |
+dnam <- GDCprepare(query) |
|
66 | 66 |
``` |
67 | 67 |
|
68 | 68 |
```{r, eval = FALSE} |
69 |
-assay(data.hg19)[1:5,1:2] |
|
69 |
+assay(dnam)[1:5,1:2] |
|
70 | 70 |
``` |
71 | 71 |
|
72 | 72 |
## Function |
73 | 73 |
<hr> |
74 | 74 |
|
75 | 75 |
```{r, eval = FALSE} |
76 |
-classification <- gliomaClassifier(data.hg19) |
|
76 |
+classification <- gliomaClassifier(dnam) |
|
77 | 77 |
``` |
78 | 78 |
|
79 | 79 |
## Results |