Browse code

1) removing relocate function that is not released yet. 2) We had to change the logic for BEATAML1.0-COHORT. Still need to check how that would affect other cases #355

Former-commit-id: 08934f702c8dcff6e4574012757e1573a72cca89 [formerly 953900335b20e98368e9c51b963013b85c7daa19]
Former-commit-id: 618018322e5cf03660ef741bc13c8f2d958c2dd8

Tiago authored on 03/05/2020 17:19:06
Showing 5 changed files

... ...
@@ -120,7 +120,8 @@ importFrom(dplyr,funs)
120 120
 importFrom(dplyr,left_join)
121 121
 importFrom(dplyr,mutate_all)
122 122
 importFrom(dplyr,pull)
123
-importFrom(dplyr,relocate)
123
+importFrom(dplyr,row_number)
124
+importFrom(dplyr,slice)
124 125
 importFrom(edgeR,DGEList)
125 126
 importFrom(edgeR,estimateCommonDisp)
126 127
 importFrom(edgeR,estimateGLMCommonDisp)
... ...
@@ -183,7 +183,7 @@ getBarcodefromAliquot <- function(aliquot){
183 183
 #' (list of barcodes, aliquot ids, etc)
184 184
 #' @param step How many items to be evaluated per API call
185 185
 #' @param FUN function that calls the API
186
-splitAPICall <- function(FUN,step = 20,items){
186
+splitAPICall <- function(FUN, step = 20, items){
187 187
    info <- NULL
188 188
    info <- tryCatch({
189 189
         for(i in 0:(ceiling(length(items)/step) - 1)){
... ...
@@ -1283,7 +1283,8 @@ getAliquot_ids <- function(barcode){
1283 1283
 
1284 1284
 # getBarcodeInfo(c("TCGA-OR-A5K3-01A","C3N-00321-01"))
1285 1285
 # barcode is: sample_submitter_id
1286
-#' @importFrom dplyr bind_cols
1286
+#' @importFrom dplyr bind_cols slice row_number
1287
+#'
1287 1288
 getBarcodeInfo <- function(barcode) {
1288 1289
   baseURL <- "https://api.gdc.cancer.gov/cases/?"
1289 1290
   options.pretty <- "pretty=true"
... ...
@@ -1332,7 +1333,7 @@ getBarcodeInfo <- function(barcode) {
1332 1333
     tryCatch({
1333 1334
       samples$submitter_id <-
1334 1335
         str_extract_all(samples$submitter_id,
1335
-                        paste(submitter_id, collapse = "|"),
1336
+                        paste(c(submitter_id,barcode), collapse = "|"),
1336 1337
                         simplify = TRUE) %>% as.character
1337 1338
     }, error = function(e){
1338 1339
       samples$submitter_id <- submitter_id
... ...
@@ -1356,7 +1357,22 @@ getBarcodeInfo <- function(barcode) {
1356 1357
     # this is required since the sample might not have a diagnosis
1357 1358
     if(!any(df$submitter_id %in% diagnoses$submitter_id)){
1358 1359
       diagnoses$submitter_id <- NULL
1359
-      df <- dplyr::bind_cols(df,diagnoses)
1360
+      # The sample migth have different sample types
1361
+      # The diagnosis the same for each one of these samples
1362
+      # in that case we will have a 1 to mapping and binding will
1363
+      # not work. We need then to replicate diagnosis to each sample
1364
+      # and not each patient
1365
+      # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R"))
1366
+      if(nrow(diagnoses) <  nrow(df)){
1367
+        diagnoses <- plyr::ldply(
1368
+          1:length(results$submitter_sample_ids),
1369
+          .fun = function(x){
1370
+            diagnoses[x] %>% # replicate diagnoses the number of samples
1371
+              as.data.frame() %>%
1372
+              dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
1373
+      }
1374
+
1375
+      df <- dplyr::bind_cols(df %>% as.data.frame,diagnoses %>% as.data.frame)
1360 1376
     } else {
1361 1377
       df <- left_join(df, diagnoses, by = "submitter_id")
1362 1378
     }
... ...
@@ -1389,7 +1405,24 @@ getBarcodeInfo <- function(barcode) {
1389 1405
 
1390 1406
     if(!any(df$submitter_id %in% demographic$submitter_id)){
1391 1407
       demographic$submitter_id <- NULL
1392
-      df <- dplyr::bind_cols(df,demographic)
1408
+      demographic$updated_datetime  <- NULL
1409
+      demographic$created_datetime <- NULL
1410
+      # The sample migth have different sample types
1411
+      # The diagnosis the same for each one of these samples
1412
+      # in that case we will have a 1 to mapping and binding will
1413
+      # not work. We need then to replicate diagnosis to each sample
1414
+      # and not each patient
1415
+      # Cases can be replicated with getBarcodeInfo(c("BA2691R","BA2577R","BA2748R"))
1416
+      if(nrow(demographic) <  nrow(df)){
1417
+        demographic <- plyr::ldply(
1418
+          1:length(results$submitter_sample_ids),
1419
+          .fun = function(x){
1420
+            demographic[x,] %>% # replicate diagnoses the number of samples
1421
+              as.data.frame() %>%
1422
+              dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
1423
+      }
1424
+
1425
+      df <- dplyr::bind_cols(df  %>% as.data.frame,demographic)
1393 1426
     } else {
1394 1427
       df <- left_join(df,demographic, by = "submitter_id")
1395 1428
     }
... ...
@@ -1408,6 +1441,16 @@ getBarcodeInfo <- function(barcode) {
1408 1441
                       by = "submitter_id")
1409 1442
     })
1410 1443
   } else {
1444
+
1445
+    if(nrow(projects.info) <  nrow(df)){
1446
+      projects.info <- plyr::ldply(
1447
+        1:length(results$submitter_sample_ids),
1448
+        .fun = function(x){
1449
+          projects.info[x,] %>% # replicate diagnoses the number of samples
1450
+            as.data.frame() %>%
1451
+            dplyr::slice(rep(dplyr::row_number(), sum(results$submitter_sample_ids[[x]] %in% barcode)))})
1452
+    }
1453
+
1411 1454
     df <- dplyr::bind_cols(df,projects.info)
1412 1455
   }
1413 1456
 
... ...
@@ -163,7 +163,7 @@
163 163
 #' @importFrom  jsonlite fromJSON
164 164
 #' @importFrom knitr kable
165 165
 #' @importFrom httr timeout
166
-#' @importFrom dplyr pull relocate
166
+#' @importFrom dplyr pull
167 167
 GDCquery <- function(project,
168 168
                      data.category,
169 169
                      data.type,
... ...
@@ -600,17 +600,18 @@ GDCquery <- function(project,
600 600
     message("ooo Check if there results for the query")
601 601
     if(nrow(results) == 0) stop("Sorry, no results were found for this query")
602 602
 
603
-    # Try ordering
603
+    # Try ordering (needs dplyr 1.0 - still not published)
604 604
     results <- tryCatch({
605
-        results %>% relocate("project") %>%
606
-            relocate(contains("type"), .after = project) %>%
607
-            relocate(contains("category"), .after = project) %>%
608
-            relocate(contains("experimental_strategy"), .after = project) %>%
609
-            relocate(contains("submitter_id"), .after = project) %>%
610
-            relocate(contains("sample_type"), .before = experimental_strategy) %>%
611
-            relocate(access,.after = last_col())  %>%
612
-            relocate(starts_with("analysis"), .before = access) %>%
613
-            relocate(contains("datetime"),.after = last_col())
605
+        results
606
+    #    results %>% relocate("project") %>%
607
+    #        relocate(contains("type"), .after = project) %>%
608
+    #        relocate(contains("category"), .after = project) %>%
609
+    #        relocate(contains("experimental_strategy"), .after = project) %>%
610
+    #        relocate(contains("submitter_id"), .after = project) %>%
611
+    #        relocate(contains("sample_type"), .before = experimental_strategy) %>%
612
+    #        relocate(access,.after = last_col())  %>%
613
+    #        relocate(starts_with("analysis"), .before = access) %>%
614
+    #        relocate(contains("datetime"),.after = last_col())
614 615
     },error = function(e){
615 616
         results
616 617
     })
... ...
@@ -145,9 +145,18 @@ test_that("IDAT files is processed", {
145 145
  #    expect_true(ncol(betas) == 1)
146 146
 })
147 147
 
148
-test_that("Prepare Samples without clinical data", {
148
+test_that("Prepare samples without clinical data", {
149 149
     # x <-  GDCquery_clinic(project = "TCGA-LUAD", type = "clinical")
150 150
     # x[is.na(x$diagnosis_id),]
151 151
     x <- colDataPrepare(c("TCGA-80-5608-01A","TCGA-17-Z053-01A","TCGA-78-7158-01A"))
152 152
     expect_true(nrow(x) == 3)
153 153
 })
154
+
155
+test_that("Prepare multiple samples from the same patient", {
156
+    # https://portal.gdc.cancer.gov/cases/d7d3de82-802d-4664-8e42-d40408b129b0?bioId=548a300f-a7eb-4dc0-b9bc-5a643ef03d5d
157
+    x <- colDataPrepare(c("BA2691R","BA2577R","BA2748R","BA2577D"))
158
+    expect_true(nrow(x) == 4)
159
+    expect_equal(x["BA2748R","sample_type"],"Primary Blood Derived Cancer - Bone Marrow")
160
+    expect_equal(x["BA2577D","sample_type"],"Recurrent Blood Derived Cancer - Bone Marrow")
161
+    expect_true("age_at_diagnosis" %in% colnames(x))
162
+})