Browse code

Fixed ops code

fortinj2 authored on 24/06/2022 19:28:02
Showing 5 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: crisprDesign
2 2
 Title: Comprehensive design of CRISPR gRNAs for nucleases and base editors
3
-Version: 0.99.88
3
+Version: 0.99.89
4 4
 Authors@R: c(
5 5
     person("Jean-Philippe", "Fortin", email = "[email protected]", role = c("aut", "cre")),
6 6
     person("Luke", "Hoberecht", email = "[email protected]", role = c("aut"))
... ...
@@ -71,7 +71,7 @@ addOpsBarcodes <- function(guideSet,
71 71
 #'     diagonal distances be set to 0 to ignore self distances?
72 72
 #'     TRUE by default. 
73 73
 #' @param splitByChunks Should distances be calculated in a chunk-wise
74
-#'     manner? TRUE by default. Highly recommended when the set of query
74
+#'     manner? FALSE by default. TRUE is strongly advised when the set of query
75 75
 #'     barcodes is large to reduce memory footprint. 
76 76
 #' @param n_chunks Integer specifying the number of chunks to be used
77 77
 #'     when \code{splitByChunks=TRUE}. If NULL (default), number of chunks
... ...
@@ -118,7 +118,7 @@ getBarcodeDistanceMatrix <- function(queryBarcodes,
118 118
     if (is.null(min_dist_edit) & binnarize){
119 119
             stop("min_dist_edit must be specified when binnarize=TRUE.")
120 120
     }
121
-    if (!splitByChunks){
121
+    if (!splitByChunks | length(queryBarcodes<=200)){
122 122
         out <- .getChunkDistanceMatrix(queryBarcodes=queryBarcodes,
123 123
                                        targetBarcodes=targetBarcodes,
124 124
                                        min_dist_edit=min_dist_edit,
... ...
@@ -202,7 +202,10 @@ getBarcodeDistanceMatrix <- function(queryBarcodes,
202 202
 #'     have edit distances less than the min_dist_edit will not be
203 203
 #'     included in the library. 2 by default. 
204 204
 #' @param dist_method String specifying distance method. 
205
-#'     Must be either "hamming" (default) or "levenstein". 
205
+#'     Must be either "hamming" (default) or "levenstein".
206
+#' @param splitByChunks Should distances be calculated in a chunk-wise
207
+#'     manner? FALSE by default. TRUE is strongly advised when the set of query
208
+#'     barcodes is large to reduce memory footprint. 
206 209
 #' 
207 210
 #' @return A subset of the \code{df} containing the gRNAs
208 211
 #'     selected for the OPS library. 
... ...
@@ -229,7 +232,8 @@ designOpsLibrary <- function(df,
229 232
                              n_guides=4,
230 233
                              gene_field="gene",
231 234
                              min_dist_edit=2,
232
-                             dist_method=c("hamming","levenstein")
235
+                             dist_method=c("hamming","levenstein"),
236
+                             splitByChunks=FALSE
233 237
 ){
234 238
     dist_method <- match.arg(dist_method)
235 239
     df <- .validateOpsGrnaInput(df, gene_field)
... ...
@@ -243,12 +247,14 @@ designOpsLibrary <- function(df,
243 247
                      genes=genes)
244 248
     grnaList <- .initiateOpsLibrary(grnaList,
245 249
                                     dist_method=dist_method,
246
-                                    min_dist_edit=min_dist_edit)
250
+                                    min_dist_edit=min_dist_edit,
251
+                                    splitByChunks=splitByChunks)
247 252
     grnaList <- .updateOpsLibrary(grnaList,
248 253
                                   gene_field=gene_field,
249 254
                                   n_guides=n_guides,
250 255
                                   dist_method=dist_method,
251
-                                  min_dist_edit=min_dist_edit)
256
+                                  min_dist_edit=min_dist_edit,
257
+                                  splitByChunks=splitByChunks)
252 258
     out <- .getFinalOpsLibrary(grnaList)
253 259
     out <- out[order(out[[gene_field]], out[["rank"]]),,drop=FALSE]
254 260
     return(out)
... ...
@@ -273,6 +279,9 @@ designOpsLibrary <- function(df,
273 279
 #'     included in the library. 2 by default. 
274 280
 #' @param dist_method String specifying distance method. 
275 281
 #'     Must be either "hamming" (default) or "levenstein". 
282
+#' @param splitByChunks Should distances be calculated in a chunk-wise
283
+#'     manner? FALSE by default. TRUE is strongly advised when the set of query
284
+#'     barcodes is large to reduce memory footprint.
276 285
 #' 
277 286
 #' @author Jean-Philippe Fortin
278 287
 #'
... ...
@@ -308,7 +317,8 @@ updateOpsLibrary <- function(opsLibrary,
308 317
                              n_guides=4,
309 318
                              gene_field="gene",
310 319
                              min_dist_edit=2,
311
-                             dist_method=c("hamming","levenstein")
320
+                             dist_method=c("hamming","levenstein"),
321
+                             splitByChunks=FALSE
312 322
 ){
313 323
     dist_method <- match.arg(dist_method)
314 324
     df <- .validateOpsGrnaInput(df, gene_field)
... ...
@@ -321,7 +331,8 @@ updateOpsLibrary <- function(opsLibrary,
321 331
                                   gene_field=gene_field,
322 332
                                   n_guides=n_guides,
323 333
                                   dist_method=dist_method,
324
-                                  min_dist_edit=min_dist_edit)
334
+                                  min_dist_edit=min_dist_edit,
335
+                                  splitByChunks=splitByChunks)
325 336
     out <- .getFinalOpsLibrary(grnaList)
326 337
     out <- out[order(out[[gene_field]], out[["rank"]]),,drop=FALSE]
327 338
     return(out)
... ...
@@ -365,13 +376,15 @@ updateOpsLibrary <- function(opsLibrary,
365 376
 #' @importFrom Matrix rowSums
366 377
 .initiateOpsLibrary <- function(grnaList,
367 378
                                 dist_method,
368
-                                min_dist_edit
379
+                                min_dist_edit,
380
+                                splitByChunks
369 381
 ){
370 382
     selected <- grnaList[["selected"]]
371 383
     mat <- getBarcodeDistanceMatrix(queryBarcodes=selected[["opsBarcode"]],
372 384
                                     binnarize=TRUE,
373 385
                                     dist_method=dist_method,
374
-                                    min_dist_edit=min_dist_edit)
386
+                                    min_dist_edit=min_dist_edit,
387
+                                    splitByChunks=splitByChunks)
375 388
     good <- Matrix::rowSums(mat>0)==0
376 389
     # In case all guides are "bad", add first one only:
377 390
     if (sum(good)==0){
... ...
@@ -390,7 +403,8 @@ updateOpsLibrary <- function(opsLibrary,
390 403
                               gene_field,
391 404
                               n_guides,
392 405
                               dist_method,
393
-                              min_dist_edit
406
+                              min_dist_edit,
407
+                              splitByChunks
394 408
 ){
395 409
     shouldWeContinue <- TRUE
396 410
     while (shouldWeContinue){
... ...
@@ -399,7 +413,8 @@ updateOpsLibrary <- function(opsLibrary,
399 413
                                           gene_field=gene_field,
400 414
                                           n_guides=n_guides,
401 415
                                           dist_method=dist_method,
402
-                                          min_dist_edit=min_dist_edit)
416
+                                          min_dist_edit=min_dist_edit,
417
+                                          splitByChunks=splitByChunks)
403 418
         counts <- table(factor(grnaList[["selected"]][[gene_field]],
404 419
                           levels=grnaList[["genes"]]))
405 420
         incomplete <- names(which(counts<n_guides))
... ...
@@ -425,7 +440,8 @@ updateOpsLibrary <- function(opsLibrary,
425 440
                                   gene_field,
426 441
                                   n_guides,
427 442
                                   dist_method,
428
-                                  min_dist_edit
443
+                                  min_dist_edit,
444
+                                  splitByChunks
429 445
 ){
430 446
   
431 447
     .getCandidates <- function(genes, n){
... ...
@@ -451,7 +467,8 @@ updateOpsLibrary <- function(opsLibrary,
451 467
         # most divergent:
452 468
         dist <- getBarcodeDistanceMatrix(cands[["opsBarcode"]],
453 469
                                          dist_method=dist_method,
454
-                                         min_dist_edit=min_dist_edit)
470
+                                         min_dist_edit=min_dist_edit,
471
+                                         splitByChunks=splitByChunks)
455 472
         score <- Matrix::rowSums(dist>0)
456 473
         cands <- cands[order(score),,drop=FALSE]
457 474
 
... ...
@@ -494,7 +511,8 @@ updateOpsLibrary <- function(opsLibrary,
494 511
         dist <- getBarcodeDistanceMatrix(cands[["opsBarcode"]],
495 512
                                          lib[["opsBarcode"]],
496 513
                                          dist_method=dist_method,
497
-                                         min_dist_edit=min_dist_edit)
514
+                                         min_dist_edit=min_dist_edit,
515
+                                         splitByChunks=splitByChunks)
498 516
         cands <- cands[Matrix::rowSums(dist)==0,,drop=FALSE]
499 517
         grnaList <- .incrementalUpdate(grnaList, cands)
500 518
     }
... ...
@@ -9,7 +9,8 @@ designOpsLibrary(
9 9
   n_guides = 4,
10 10
   gene_field = "gene",
11 11
   min_dist_edit = 2,
12
-  dist_method = c("hamming", "levenstein")
12
+  dist_method = c("hamming", "levenstein"),
13
+  splitByChunks = FALSE
13 14
 )
14 15
 }
15 16
 \arguments{
... ...
@@ -29,6 +30,10 @@ included in the library. 2 by default.}
29 30
 
30 31
 \item{dist_method}{String specifying distance method. 
31 32
 Must be either "hamming" (default) or "levenstein".}
33
+
34
+\item{splitByChunks}{Should distances be calculated in a chunk-wise
35
+manner? FALSE by default. TRUE is strongly advised when the set of query
36
+barcodes is large to reduce memory footprint.}
32 37
 }
33 38
 \value{
34 39
 A subset of the \code{df} containing the gRNAs
... ...
@@ -37,7 +37,7 @@ diagonal distances be set to 0 to ignore self distances?
37 37
 TRUE by default.}
38 38
 
39 39
 \item{splitByChunks}{Should distances be calculated in a chunk-wise
40
-manner? TRUE by default. Highly recommended when the set of query
40
+manner? FALSE by default. TRUE is strongly advised when the set of query
41 41
 barcodes is large to reduce memory footprint.}
42 42
 
43 43
 \item{n_chunks}{Integer specifying the number of chunks to be used
... ...
@@ -10,7 +10,8 @@ updateOpsLibrary(
10 10
   n_guides = 4,
11 11
   gene_field = "gene",
12 12
   min_dist_edit = 2,
13
-  dist_method = c("hamming", "levenstein")
13
+  dist_method = c("hamming", "levenstein"),
14
+  splitByChunks = FALSE
14 15
 )
15 16
 }
16 17
 \arguments{
... ...
@@ -32,6 +33,10 @@ included in the library. 2 by default.}
32 33
 
33 34
 \item{dist_method}{String specifying distance method. 
34 35
 Must be either "hamming" (default) or "levenstein".}
36
+
37
+\item{splitByChunks}{Should distances be calculated in a chunk-wise
38
+manner? FALSE by default. TRUE is strongly advised when the set of query
39
+barcodes is large to reduce memory footprint.}
35 40
 }
36 41
 \value{
37 42
 A data.frame containing the original gRNAs from