Browse code

Merge branch 'fixCrispraiPy2' into main

fortinj2 authored on 24/07/2022 17:25:20
Showing 4 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: crisprScore
2
-Version: 1.1.11
2
+Version: 1.1.12
3 3
 Date: 2022-07-23
4 4
 Title: On-Target and Off-Target Scoring Algorithms for CRISPR gRNAs
5 5
 Authors@R: c(
... ...
@@ -34,8 +34,6 @@
34 34
 #' @param fastaFile String specifying fasta file of the hg38 genome.
35 35
 #' @param chromatinFiles Named character vector of length 3 specifying
36 36
 #'     BigWig files containing chromatin accessibility data. 
37
-#' @param fork Set to \code{TRUE} to preserve changes to the R
38
-#'     configuration within the session.
39 37
 #'     
40 38
 #' @details \code{tss_df} details:
41 39
 #'     This must be a \code{data.frame} that contains the following columns:
... ...
@@ -86,11 +84,8 @@ getCrispraiScores <- function(tss_df,
86 84
                               verbose=FALSE,
87 85
                               modality=c("CRISPRa", "CRISPRi"),
88 86
                               fastaFile=NULL,
89
-                              chromatinFiles=NULL,
90
-                              fork=FALSE
87
+                              chromatinFiles=NULL
91 88
 ){
92
-
93
-
94 89
     .checkChromatinFiles(chromatinFiles)
95 90
     .checkFastaFile(fastaFile)
96 91
     .checkTssFrame(tss_df)
... ...
@@ -101,165 +96,33 @@ getCrispraiScores <- function(tss_df,
101 96
                                    sgrna_df,
102 97
                                    verbose=verbose)
103 98
 
104
-    #dir <- system.file("crisprai",
105
-    #                   "temp_data",
106
-    #                   package="crisprScore",
107
-    #                   mustWork=TRUE)
108
-    #dir <- "/Users/fortinj2/crisprScore/inst/crisprai/temp_data"
109
-    #pickleFile <- paste0(dir, "/", modality, "_model.pkl")
110
-
111 99
     if (modality=="CRISPRa"){
112 100
         pickleFile <- crisprScoreData::CRISPRa_model.pkl()
113 101
     } else if (modality=="CRISPRi"){
114 102
         pickleFile <- crisprScoreData::CRISPRi_model.pkl()
115 103
     }
116 104
     
117
-
118
-    results <- basiliskRun(env=env_crisprai,
119
-                           shared=FALSE,
120
-                           fork=fork,
121
-                           fun=.pyPredictWeissmanScore,
122
-                           modality=modality,
123
-                           tssTable=inputList[["tssTable"]],
124
-                           p1p2Table=inputList[["p1p2Table"]],
125
-                           sgrnaTable=inputList[["sgrnaTable"]],
126
-                           libraryTable=inputList[["libraryTable"]],
127
-                           pickleFile=pickleFile,
128
-                           fastaFile=fastaFile,
129
-                           chromatinFiles=chromatinFiles,
130
-                           verbose=verbose)
131
-
105
+    results <- .pyPredictWeissmanScore(modality=modality,
106
+                                       tssTable=inputList[["tssTable"]],
107
+                                       p1p2Table=inputList[["p1p2Table"]],
108
+                                       sgrnaTable=inputList[["sgrnaTable"]],
109
+                                       libraryTable=inputList[["libraryTable"]],
110
+                                       pickleFile=pickleFile,
111
+                                       fastaFile=fastaFile,
112
+                                       chromatinFiles=chromatinFiles,
113
+                                       verbose=verbose)
114
+    rownames(results) <- sgrna_df[["grna_id"]]
132 115
     return(results)
133 116
 }
134 117
 
135 118
 
136
-.checkFastaFile <- function(fasta){
137
-    if (is.null(fasta)){
138
-        stop("Argument fasta cannot be NULL.")
139
-    }
140
-    if (length(fasta)!=1){
141
-        stop("fasta must be a character vector of length 1.")
142
-    }
143
-     if (!file.exists(fasta)){
144
-        stop("Fasta file cannot be found.")
145
-    }
146
-    invisible(TRUE)
147
-}
148
-
149
-.checkChromatinFiles <- function(chromatinFiles){
150
-    if (is.null(chromatinFiles)){
151
-        stop("Argument chromatinFiles cannot be NULL.")
152
-    }
153
-    if (length(chromatinFiles)!=3){
154
-        stop("chromatinFiles must be a character vector of length 3.")
155
-    }
156
-    choices <- c("mnase", "faire", "dnase")
157
-    if (!all(sort(names(chromatinFiles))==sort(names(choices)))){
158
-        stop("chromatinFiles must be a character vector with the",
159
-             " following names: mnase, faire, dnase.")
160
-    }
161
-    if (!sum(file.exists(chromatinFiles))==3){
162
-        stop("Some of the chromatin files cannot be found.")
163
-    }
164
-    invisible(TRUE)
165
-}
166
-
167
-
168
-
169
-
170
-
171
-
172
-.checkTssFrame <- function(tssFrame){
173
-    cols <- c("tss_id",
174
-              "gene_symbol",
175
-              "promoter",
176
-              "transcripts",
177
-              "position",
178
-              "strand",
179
-              "chr")
180
-    if (!all(cols %in% colnames(tssFrame))){
181
-        choices <- setdiff(cols, colnames(tssFrame))
182
-        stop("The following columns are missing in the tssFrame: \n \t",
183
-             paste0(choices, collapse=", "),".")
184
-    }
185
-    
186
-    if (sum(is.na(tssFrame$tss_id))>0){
187
-        stop("tss_id has some missing values.")
188
-    }
189
-    if (sum(is.na(tssFrame$gene_symbol))>0){
190
-        stop("gene_symbol has some missing values.")
191
-    }
192
-    if (sum(is.na(tssFrame$promoter))>0){
193
-        stop("promoter has some missing values.")
194
-    }
195
-    #if (sum(is.na(tssFrame$transcripts))>0){
196
-    #    stop("transcripts has some missing values.")
197
-    #}
198
-    if (sum(is.na(tssFrame$position))>0){
199
-        stop("position has some missing values.")
200
-    }
201
-    if (sum(is.na(tssFrame$strand))>0){
202
-        stop("strand has some missing values.")
203
-    }
204
-    if (sum(is.na(tssFrame$chr))>0){
205
-        stop("chr has some missing values.")
206
-    }
207
-
208
-    # Check promoters:
209
-    dfs <- split(tssFrame, f=tssFrame$gene_symbol)
210
-    lens <- vapply(dfs, function(df){
211
-        length(unique(df$strand))
212
-    }, FUN.VALUE=1)
213
-    if (any(lens>1)){
214
-        stop("Some genes have promoters with different strand directions.")
215
-    }
216
-    invisible(TRUE)
217
-}
218
-
219
-.checkGrnaFrame <- function(grnaFrame){
220
-    cols <- c("grna_id",
221
-              "tss_id",
222
-              "pam_site",
223
-              "strand", 
224
-              "spacer_19mer")
225
-    if (!all(cols %in% colnames(grnaFrame))){
226
-        choices <- setdiff(cols, colnames(grnaFrame))
227
-        stop("The following columns are missing in the grnaFrame: \n \t",
228
-             paste0(choices, collapse=", "),".")
229
-    }
230
-    if (sum(is.na(grnaFrame$grna_id))>0){
231
-        stop("grna_id has some missing values.")
232
-    }
233
-    if (sum(is.na(grnaFrame$tss_id))>0){
234
-        stop("tss_id has some missing values.")
235
-    }
236
-    if (sum(is.na(grnaFrame$pam_site))>0){
237
-        stop("pam_site has some missing values.")
238
-    }
239
-    if (sum(is.na(grnaFrame$strand))>0){
240
-        stop("strand has some missing values.")
241
-    }
242
-    if (sum(is.na(grnaFrame$spacer_19mer))>0){
243
-        stop("spacer_19mer has some missing values.")
244
-    }
245
-    lens <- unique(nchar(grnaFrame$spacer_19mer))
246
-    if (length(lens)!=1){
247
-        stop("Sequences specified in spacer_19mer must",
248
-             " all be of length 19.")
249
-    } else {
250
-        if (lens!=19){
251
-            stop("Sequences specified in spacer_19mer must",
252
-                 " all be of length 19.")
253
-        }
254
-    }
255
-    invisible(TRUE)
256
-}
257
-
258 119
 
259 120
 
260 121
 #' @importFrom reticulate import_from_path
261 122
 #' @importFrom reticulate py_suppress_warnings
262 123
 #' @importFrom reticulate r_to_py
124
+#' @importFrom basilisk.utils activateEnvironment
125
+#' @importFrom basilisk.utils deactivateEnvironment
263 126
 .pyPredictWeissmanScore <- function(modality,
264 127
                                     tssTable,
265 128
                                     p1p2Table,
... ...
@@ -271,38 +134,55 @@ getCrispraiScores <- function(tss_df,
271 134
                                     verbose
272 135
 
273 136
 ){
137
+    env <- basilisk::obtainEnvironmentPath(env_crisprai)
138
+    envls <- basilisk.utils::activateEnvironment(env)
139
+    on.exit(basilisk.utils::deactivateEnvironment(envls))
274 140
 
275 141
     if (.Platform$OS.type=="windows"){
276 142
         stop("CRISPRai is not available for Windows at the moment.")
277 143
     }
278 144
 
279
-    tssTable <- r_to_py(tssTable)
280
-    p1p2Table <- r_to_py(p1p2Table)
281
-    sgrnaTable <- r_to_py(sgrnaTable)
282
-    libraryTable <- r_to_py(libraryTable)
283
-    chromatinFiles <- r_to_py(chromatinFiles)
284
-
285
-    dir <- system.file("python",
286
-                       "crisprai",
287
-                       package="crisprScore",
288
-                       mustWork=TRUE)
289
-
290
-    pyWeissmanScore <- reticulate::import_from_path("predictWeissmanScores",
291
-                                                    path=dir)
292
-    
293
-    # TO DO: add chromatin file vectors and pickle file path vectors
294
-    
295
-    scores <- py_suppress_warnings(
296
-        pyWeissmanScore$predictWeissmanScore(tssTable=tssTable,
297
-                                             p1p2Table=p1p2Table,
298
-                                             sgrnaTable=sgrnaTable,
299
-                                             libraryTable=libraryTable,
300
-                                             modality=modality,
301
-                                             pickleFile=pickleFile,
302
-                                             fastaFile=fastaFile,
303
-                                             chromatinFiles=chromatinFiles,
304
-                                             verbose=verbose))
145
+    # We are going to save the tables to a temporary directory
146
+    .dumpToFile <- function(table, file){
147
+        write.table(table,
148
+                    sep="\t",
149
+                    file=file,
150
+                    quote=FALSE,
151
+                    col.names=TRUE,
152
+                    row.names=FALSE)
153
+    }
154
+    inputDir <- tempdir()
155
+    .dumpToFile(tssTable, file=file.path(inputDir, "tssTable.tsv"))
156
+    .dumpToFile(p1p2Table, file=file.path(inputDir, "p1p2Table.tsv"))
157
+    .dumpToFile(sgrnaTable, file=file.path(inputDir, "sgrnaTable.tsv"))
158
+    .dumpToFile(libraryTable, file=file.path(inputDir, "libraryTable.tsv"))
305 159
     
160
+
161
+    # Creating a roster file of all files:
162
+    roster <- data.frame(path=chromatinFiles)
163
+    roster$object <- names(chromatinFiles)
164
+    rownames(roster) <- NULL
165
+    roster <- roster[, c("object", "path")]
166
+    roster <- rbind(roster, c("fasta",fastaFile))
167
+    roster <- rbind(roster, c("pickleFile",pickleFile))
168
+    roster <- rbind(roster, c("tssTable", file.path(inputDir, "tssTable.tsv")))
169
+    roster <- rbind(roster, c("p1p2Table", file.path(inputDir, "p1p2Table.tsv")))
170
+    roster <- rbind(roster, c("sgrnaTable", file.path(inputDir, "sgrnaTable.tsv")))
171
+    roster <- rbind(roster, c("libraryTable", file.path(inputDir, "libraryTable.tsv")))
172
+    rosterFile <- file.path(inputDir, "roster.tsv")
173
+    .dumpToFile(roster, file=rosterFile)
174
+    outputFile <- file.path(inputDir, "scores.txt")
175
+
176
+    # Ready to call the python to generate the scores:
177
+    program <- system.file("python",
178
+                           "crisprai",
179
+                           "getWeissmanScores.py",
180
+                           package="crisprScore",
181
+                           mustWork=TRUE)
182
+    system2("python",
183
+            c(program, rosterFile, modality, outputFile, verbose))
184
+
185
+    scores <- read.table(outputFile)
306 186
     scores <- as.data.frame(scores)
307 187
     colnames(scores) <- c("score")
308 188
     return(scores)
... ...
@@ -619,3 +499,129 @@ getCrispraiScores <- function(tss_df,
619 499
     return(inputList)
620 500
 }
621 501
 
502
+
503
+
504
+
505
+.checkFastaFile <- function(fasta){
506
+    if (is.null(fasta)){
507
+        stop("Argument fasta cannot be NULL.")
508
+    }
509
+    if (length(fasta)!=1){
510
+        stop("fasta must be a character vector of length 1.")
511
+    }
512
+     if (!file.exists(fasta)){
513
+        stop("Fasta file cannot be found.")
514
+    }
515
+    invisible(TRUE)
516
+}
517
+
518
+.checkChromatinFiles <- function(chromatinFiles){
519
+    if (is.null(chromatinFiles)){
520
+        stop("Argument chromatinFiles cannot be NULL.")
521
+    }
522
+    if (length(chromatinFiles)!=3){
523
+        stop("chromatinFiles must be a character vector of length 3.")
524
+    }
525
+    choices <- c("mnase", "faire", "dnase")
526
+    if (!all(sort(names(chromatinFiles))==sort(names(choices)))){
527
+        stop("chromatinFiles must be a character vector with the",
528
+             " following names: mnase, faire, dnase.")
529
+    }
530
+    if (!sum(file.exists(chromatinFiles))==3){
531
+        stop("Some of the chromatin files cannot be found.")
532
+    }
533
+    invisible(TRUE)
534
+}
535
+
536
+
537
+
538
+
539
+
540
+
541
+.checkTssFrame <- function(tssFrame){
542
+    cols <- c("tss_id",
543
+              "gene_symbol",
544
+              "promoter",
545
+              "transcripts",
546
+              "position",
547
+              "strand",
548
+              "chr")
549
+    if (!all(cols %in% colnames(tssFrame))){
550
+        choices <- setdiff(cols, colnames(tssFrame))
551
+        stop("The following columns are missing in the tssFrame: \n \t",
552
+             paste0(choices, collapse=", "),".")
553
+    }
554
+    
555
+    if (sum(is.na(tssFrame$tss_id))>0){
556
+        stop("tss_id has some missing values.")
557
+    }
558
+    if (sum(is.na(tssFrame$gene_symbol))>0){
559
+        stop("gene_symbol has some missing values.")
560
+    }
561
+    if (sum(is.na(tssFrame$promoter))>0){
562
+        stop("promoter has some missing values.")
563
+    }
564
+    #if (sum(is.na(tssFrame$transcripts))>0){
565
+    #    stop("transcripts has some missing values.")
566
+    #}
567
+    if (sum(is.na(tssFrame$position))>0){
568
+        stop("position has some missing values.")
569
+    }
570
+    if (sum(is.na(tssFrame$strand))>0){
571
+        stop("strand has some missing values.")
572
+    }
573
+    if (sum(is.na(tssFrame$chr))>0){
574
+        stop("chr has some missing values.")
575
+    }
576
+
577
+    # Check promoters:
578
+    dfs <- split(tssFrame, f=tssFrame$gene_symbol)
579
+    lens <- vapply(dfs, function(df){
580
+        length(unique(df$strand))
581
+    }, FUN.VALUE=1)
582
+    if (any(lens>1)){
583
+        stop("Some genes have promoters with different strand directions.")
584
+    }
585
+    invisible(TRUE)
586
+}
587
+
588
+.checkGrnaFrame <- function(grnaFrame){
589
+    cols <- c("grna_id",
590
+              "tss_id",
591
+              "pam_site",
592
+              "strand", 
593
+              "spacer_19mer")
594
+    if (!all(cols %in% colnames(grnaFrame))){
595
+        choices <- setdiff(cols, colnames(grnaFrame))
596
+        stop("The following columns are missing in the grnaFrame: \n \t",
597
+             paste0(choices, collapse=", "),".")
598
+    }
599
+    if (sum(is.na(grnaFrame$grna_id))>0){
600
+        stop("grna_id has some missing values.")
601
+    }
602
+    if (sum(is.na(grnaFrame$tss_id))>0){
603
+        stop("tss_id has some missing values.")
604
+    }
605
+    if (sum(is.na(grnaFrame$pam_site))>0){
606
+        stop("pam_site has some missing values.")
607
+    }
608
+    if (sum(is.na(grnaFrame$strand))>0){
609
+        stop("strand has some missing values.")
610
+    }
611
+    if (sum(is.na(grnaFrame$spacer_19mer))>0){
612
+        stop("spacer_19mer has some missing values.")
613
+    }
614
+    lens <- unique(nchar(grnaFrame$spacer_19mer))
615
+    if (length(lens)!=1){
616
+        stop("Sequences specified in spacer_19mer must",
617
+             " all be of length 19.")
618
+    } else {
619
+        if (lens!=19){
620
+            stop("Sequences specified in spacer_19mer must",
621
+                 " all be of length 19.")
622
+        }
623
+    }
624
+    invisible(TRUE)
625
+}
626
+
627
+
622 628
similarity index 75%
623 629
rename from inst/python/crisprai/predictWeissmanScores.py
624 630
rename to inst/python/crisprai/getWeissmanScores.py
... ...
@@ -1,5 +1,11 @@
1
+#sys.argv[1] should be the path of the roster file specifying the input files
2
+#sys.argv[2] should be the modality
3
+#sys.argv[3] should be the output file
4
+#sys.argv[4] should be the verbose argument
5
+
1 6
 import cPickle
2 7
 from sgRNA_learning import *
8
+import pandas as pd
3 9
 
4 10
 
5 11
 def predictWeissmanScore(tssTable, p1p2Table, sgrnaTable, libraryTable, pickleFile, fastaFile, chromatinFiles, modality, verbose):
... ...
@@ -72,4 +78,30 @@ def getTransformedParams(paramTable, fitTable, estimators, verbose):
72 78
         colTups.append((l1,str(l2)))
73 79
     transformedParams_new.columns = pd.MultiIndex.from_tuples(colTups)
74 80
 
75
-    return transformedParams_new
76 81
\ No newline at end of file
82
+    return transformedParams_new
83
+
84
+
85
+
86
+# Ready to load the data from R
87
+roster_file = sys.argv[1]
88
+modality = sys.argv[2]
89
+output_file = sys.argv[3]
90
+verbose = sys.argv[4]
91
+
92
+roster = pd.read_csv(roster_file, sep='\t', header=0)
93
+roster = dict(zip(roster.object, roster.path))
94
+tssTable = pd.read_csv(roster["tssTable"],sep='\t', header=0)
95
+p1p2Table = pd.read_csv(roster["p1p2Table"],sep='\t', header=0)
96
+sgrnaTable = pd.read_csv(roster["sgrnaTable"],sep='\t', header=0)
97
+libraryTable = pd.read_csv(roster["libraryTable"],sep='\t', header=0)
98
+pickleFile = roster["pickleFile"]
99
+fastaFile = roster["fasta"]
100
+
101
+keys = ["dnase", "faire", "mnase"]
102
+chromatinFiles = [roster.get(key) for key in keys]
103
+
104
+
105
+
106
+scores = predictWeissmanScore(tssTable, p1p2Table, sgrnaTable, libraryTable, pickleFile, fastaFile, chromatinFiles, modality, verbose)
107
+np.savetxt(output_file, scores)
108
+
... ...
@@ -10,8 +10,7 @@ getCrispraiScores(
10 10
   verbose = FALSE,
11 11
   modality = c("CRISPRa", "CRISPRi"),
12 12
   fastaFile = NULL,
13
-  chromatinFiles = NULL,
14
-  fork = FALSE
13
+  chromatinFiles = NULL
15 14
 )
16 15
 }
17 16
 \arguments{
... ...
@@ -36,9 +35,6 @@ TRUE by default.}
36 35
 
37 36
 \item{chromatinFiles}{Named character vector of length 3 specifying
38 37
 BigWig files containing chromatin accessibility data.}
39
-
40
-\item{fork}{Set to \code{TRUE} to preserve changes to the R
41
-configuration within the session.}
42 38
 }
43 39
 \value{
44 40
 \strong{getCrispraiScores} returns a \code{data.frame} with