Browse code

Implemented 'choose', 'average' and 'conf' reassignment modes in Telescope

beacalvo authored on 10/06/2022 14:01:21
Showing 4 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: atena
2 2
 Type: Package
3 3
 Title: Analysis of Transposable Elements
4
-Version: 1.3.0
4
+Version: 1.3.1
5 5
 Authors@R: c(
6 6
   person("Beatriz", "Calvo-Serra", email = "[email protected]", role = c("aut","cre")),
7 7
   person("Robert", "Castelo", email = "[email protected]", role = c("aut")))
... ...
@@ -202,7 +202,20 @@ setClass("ERVmapParam", contains="AtenaParam",
202 202
 #' iterations of the EM SQUAREM algorithm (Du and Varadhan, 2020). Default
203 203
 #' is 100 and this value is passed to the \code{maxiter} parameter of the
204 204
 #' \code{\link[SQUAREM]{squarem}()} function.
205
-#'
205
+#' 
206
+#' @slot reassign_mode (Default 'exclude') Character vector indicating
207
+#' reassignment mode after EM step. 
208
+#' Available methods are 'exclude' (reads with more than one best
209
+#' assignment are excluded from the final counts), 'choose' (when reads have
210
+#' more than one best assignment, one of them is randomly chosen), 'average'
211
+#' (the read count is divided evenly among the best assignments) and 'conf'
212
+#' (only assignments that exceed a certain threshold -defined by 
213
+#' \code{conf_prob} parameter- are accepted, then the read count is
214
+#' proportionally divided among the assignments above \code{conf_prob}).
215
+#' 
216
+#' @slot conf_prob (Default 0.9) Minimum probability for high confidence
217
+#' assignment.
218
+#' 
206 219
 #' @references
207 220
 #' Bendall et al. Telescope: characterization of the retrotranscriptome by
208 221
 #' accurate estimation of transposable element expression.
... ...
@@ -221,7 +234,9 @@ setClass("TelescopeParam", contains="AtenaParam",
221 234
                         pi_prior="integer",
222 235
                         theta_prior="integer",
223 236
                         em_epsilon="numeric",
224
-                        maxIter="integer"))
237
+                        maxIter="integer",
238
+                        reassign_mode="character",
239
+                        conf_prob="numeric"))
225 240
 
226 241
 #' TEtranscripts parameter class
227 242
 #'
... ...
@@ -76,6 +76,19 @@
76 76
 #' iterations of the EM SQUAREM algorithm (Du and Varadhan, 2020). Default
77 77
 #' is 100 and this value is passed to the \code{maxiter} parameter of the
78 78
 #' \code{\link[SQUAREM]{squarem}()} function.
79
+#' 
80
+#' @param reassign_mode (Default 'exclude') Character vector indicating
81
+#' reassignment mode after EM step. 
82
+#' Available methods are 'exclude' (reads with more than one best
83
+#' assignment are excluded from the final counts), 'choose' (when reads have
84
+#' more than one best assignment, one of them is randomly chosen), 'average'
85
+#' (the read count is divided evenly among the best assignments) and 'conf'
86
+#' (only assignments that exceed a certain threshold -defined by 
87
+#' \code{conf_prob} parameter- are accepted, then the read count is
88
+#' proportionally divided among the assignments above \code{conf_prob}).
89
+#' 
90
+#' @param conf_prob (Default 0.9) Minimum probability for high confidence
91
+#' assignment.
79 92
 #'
80 93
 #'
81 94
 #' @details
... ...
@@ -121,7 +134,9 @@ TelescopeParam <- function(bfl, teFeatures, aggregateby=character(0),
121 134
                             pi_prior=0L,
122 135
                             theta_prior=0L,
123 136
                             em_epsilon=1e-7,
124
-                            maxIter=100L) {
137
+                            maxIter=100L,
138
+                            reassign_mode="exclude",
139
+                            conf_prob=0.9) {
125 140
     bfl <- .checkBamFileListArgs(bfl, singleEnd, fragments)
126 141
     
127 142
     features <- .processFeatures(teFeatures, deparse(substitute(teFeatures)),
... ...
@@ -133,7 +148,8 @@ TelescopeParam <- function(bfl, teFeatures, aggregateby=character(0),
133 148
         strandMode=as.integer(strandMode), fragments=fragments,
134 149
         minOverlFract=minOverlFract, pi_prior=pi_prior,
135 150
         theta_prior=theta_prior, em_epsilon=em_epsilon,
136
-        maxIter=as.integer(maxIter))
151
+        maxIter=as.integer(maxIter), reassign_mode=reassign_mode,
152
+        conf_prob=conf_prob)
137 153
 }
138 154
 
139 155
 #' @param object A \linkS4class{TelescopeParam} object.
... ...
@@ -372,28 +388,61 @@ setMethod("qtex", "TelescopeParam",
372 388
     PiTS[PiTS < 0] <- 0 ## Pi estimates are sometimes negatively close to zero
373 389
     # --- end EM-step ---
374 390
     
375
-    # Implementation for reassign_mode=exclude
376 391
     Theta <- get("Theta", envir=Thetaenv)
377 392
     X <- .tsEstep(QmatTS, Theta, maskmulti, PiTS)
393
+    cntvec <- .reassign(X, tspar@reassign_mode, tspar@conf_prob, cntvec, tx_idx)
394
+    
395
+    names(cntvec) <- c(names(tspar@features), "no_feature")
396
+    nofeat <- cntvec["no_feature"]
397
+    cntvec <- .tssummarizeCounts(cntvec[-length(cntvec)], iste, tspar)
398
+    cntvec <- c(cntvec, nofeat)
399
+    cntvec
400
+}
401
+
402
+## private function .reassign()
403
+## Implements 4 different reassigning methods from Telescope (exclude, choose,
404
+## average and conf)
405
+#' @importFrom sparseMatrixStats rowSums2 colSums2
406
+.reassign <- function(X, reassign_mode, conf_prob, cntvec, tx_idx) {
407
+  
408
+  if (reassign_mode %in% c("exclude", "choose", "average")) {
378 409
     maxbyrow <- rowMaxs(X)
379
-    # Version 1
410
+    # Older versions
380 411
     # Xind <- X == maxbyrow
381
-    # Version 2
382 412
     # Xind <- (X / maxbyrow) == 1
383
-    
384 413
     Xind <- as(X, "lgCMatrix")
385 414
     Xind@x <- (X@x /maxbyrow[X@i+1]) == 1
386 415
     nmaxbyrow <- rowSums2(Xind)
387
-    
388
-    # Do not differenciate between TE and genes with istex because in Telescope
389
-    # genes are also included in the EMstep.
390 416
     cntvec[tx_idx] <- colSums2(Xind[nmaxbyrow == 1, ])
417
+  
418
+    if (reassign_mode == "choose") {
419
+      Xind2_s <- summary(Xind[nmaxbyrow > 1, ])
420
+      Xind2_s <- Xind2_s[Xind2_s$x == TRUE,]
421
+      Xind2_s <- Xind2_s[order(Xind2_s$i),]
422
+      bestov <- as.vector(table(Xind2_s$i))
423
+      selected_ov <- vapply(bestov, FUN = function(x) sample(x = x, size = 1), 
424
+                            FUN.VALUE = integer(1L))
425
+      ovcol <- table(Xind2_s[c(0,cumsum(bestov)[-length(bestov)]) + selected_ov,"j"])
426
+      whcol <- as.integer(names(ovcol))
427
+      cntvec[tx_idx][whcol] <- cntvec[tx_idx][whcol] + as.integer(ovcol)
428
+    }
391 429
     
392
-    names(cntvec) <- c(names(tspar@features), "no_feature")
393
-    nofeat <- cntvec["no_feature"]
394
-    cntvec <- .tssummarizeCounts(cntvec[-length(cntvec)], iste, tspar)
395
-    cntvec <- c(cntvec, nofeat)
396
-    cntvec
430
+    if (reassign_mode == "average") {
431
+      Xind2 <- Xind[nmaxbyrow > 1, ]
432
+      cntvec[tx_idx] <- cntvec[tx_idx] + colSums2(Xind2/rowSums2(Xind2))
433
+    }
434
+    
435
+  } else if (reassign_mode == "conf") {
436
+    whbelow <- which(X < conf_prob & X > 0, arr.ind = TRUE)
437
+    X[whbelow] <- 0
438
+    X_conf <- X[rowSums2(X) > 0,]
439
+    cntvec[tx_idx] <- colSums2(X_conf/rowSums2(X_conf))
440
+    
441
+  } else {
442
+    stop("'reassign_mode' should be one of 'exclude', 'choose', 'average' or 'conf'")
443
+  }
444
+  
445
+  cntvec
397 446
 }
398 447
 
399 448
 
... ...
@@ -20,7 +20,9 @@ TelescopeParam(
20 20
   pi_prior = 0L,
21 21
   theta_prior = 0L,
22 22
   em_epsilon = 1e-07,
23
-  maxIter = 100L
23
+  maxIter = 100L,
24
+  reassign_mode = "exclude",
25
+  conf_prob = 0.9
24 26
 )
25 27
 
26 28
 \S4method{show}{TelescopeParam}(object)
... ...
@@ -101,6 +103,19 @@ iterations of the EM SQUAREM algorithm (Du and Varadhan, 2020). Default
101 103
 is 100 and this value is passed to the \code{maxiter} parameter of the
102 104
 \code{\link[SQUAREM]{squarem}()} function.}
103 105
 
106
+\item{reassign_mode}{(Default 'exclude') Character vector indicating
107
+reassignment mode after EM step. 
108
+Available methods are 'exclude' (reads with more than one best
109
+assignment are excluded from the final counts), 'choose' (when reads have
110
+more than one best assignment, one of them is randomly chosen), 'average'
111
+(the read count is divided evenly among the best assignments) and 'conf'
112
+(only assignments that exceed a certain threshold -defined by 
113
+\code{conf_prob} parameter- are accepted, then the read count is
114
+proportionally divided among the assignments above \code{conf_prob}).}
115
+
116
+\item{conf_prob}{(Default 0.9) Minimum probability for high confidence
117
+assignment.}
118
+
104 119
 \item{object}{A \linkS4class{TelescopeParam} object.}
105 120
 }
106 121
 \value{
... ...
@@ -168,6 +183,19 @@ Algorithm Epsilon cutoff.}
168 183
 iterations of the EM SQUAREM algorithm (Du and Varadhan, 2020). Default
169 184
 is 100 and this value is passed to the \code{maxiter} parameter of the
170 185
 \code{\link[SQUAREM]{squarem}()} function.}
186
+
187
+\item{\code{reassign_mode}}{(Default 'exclude') Character vector indicating
188
+reassignment mode after EM step. 
189
+Available methods are 'exclude' (reads with more than one best
190
+assignment are excluded from the final counts), 'choose' (when reads have
191
+more than one best assignment, one of them is randomly chosen), 'average'
192
+(the read count is divided evenly among the best assignments) and 'conf'
193
+(only assignments that exceed a certain threshold -defined by 
194
+\code{conf_prob} parameter- are accepted, then the read count is
195
+proportionally divided among the assignments above \code{conf_prob}).}
196
+
197
+\item{\code{conf_prob}}{(Default 0.9) Minimum probability for high confidence
198
+assignment.}
171 199
 }}
172 200
 
173 201
 \examples{