Browse code

Use subsetSCECols to subset data in example

Yusuke Koga authored on 15/10/2020 17:11:54
Showing 36 changed files

... ...
@@ -49,14 +49,14 @@
49 49
 #' @param logfile Character. Messages will be redirected to a file named
50 50
 #'  `logfile`. If NULL, messages will be printed to stdout.  Default NULL.
51 51
 #' @param verbose Logical. Whether to print log messages. Default TRUE.
52
-#' 
52
+#'
53 53
 #' @return A \link[SingleCellExperiment]{SingleCellExperiment} object with
54 54
 #'  'decontX_Contamination' and 'decontX_Clusters' added to the
55 55
 #'  \link[SummarizedExperiment]{colData} slot. Additionally, the
56 56
 #' decontaminated counts will be added as an assay called 'decontXCounts'.
57 57
 #' @examples
58 58
 #' data(scExample, package = "singleCellTK")
59
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
59
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
60 60
 #' sce <- runDecontX(sce)
61 61
 #' @export
62 62
 runDecontX <- function(inSCE,
... ...
@@ -84,8 +84,8 @@ runDecontX <- function(inSCE,
84 84
 
85 85
   message(paste0(date(), " ... Running 'DecontX'"))
86 86
 
87
-  inSCE <- celda::decontX(x = inSCE, 
88
-                          batch = sample, 
87
+  inSCE <- celda::decontX(x = inSCE,
88
+                          batch = sample,
89 89
                           assayName = useAssay,
90 90
                           z = z,
91 91
                           maxIter = maxIter,
... ...
@@ -100,7 +100,7 @@ runDecontX <- function(inSCE,
100 100
                           verbose = verbose)
101 101
 
102 102
   #argsList <- argsList[!names(argsList) %in% ("...")]
103
-  
103
+
104 104
   inSCE@metadata$runDecontX <- argsList[-1]
105 105
   inSCE@metadata$runDecontX$packageVersion <- utils::packageDescription("celda")$Version
106 106
 
... ...
@@ -330,7 +330,7 @@
330 330
 #'  'doublet_finder_doublet_score'.
331 331
 #' @examples
332 332
 #' data(scExample, package = "singleCellTK")
333
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
333
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
334 334
 #' sce <- runDoubletFinder(sce)
335 335
 #' @export
336 336
 #' @importFrom SummarizedExperiment colData colData<-
... ...
@@ -1,115 +1,115 @@
1
-#' Uniform Manifold Approximation and Projection(UMAP) algorithm for
2
-#' dimension reduction.
3
-#'
4
-#' @param inSCE Input \linkS4class{SingleCellExperiment} object.
5
-#' @param useAssay Indicate which assay to use. The default is "counts".
6
-#' @param sample Character vector. Indicates which sample each cell belongs to.
7
-#' @param reducedDimName a name to store the results of the dimension reduction
8
-#' coordinates obtained from this method. This is stored in the SingleCellExperiment
9
-#' object in the reducedDims slot. Default "UMAP".
10
-#' @param logNorm Whether the counts will need to be log-normalized prior to
11
-#' generating the UMAP via scater::logNormCounts. Default TRUE.
12
-#' @param nNeighbors The size of local neighborhood used for
13
-#'   manifold approximation. Larger values result in more global
14
-#'   views of the manifold, while smaller values result in more
15
-#'   local data being preserved. Default 30.
16
-#'    See `?uwot::umap` for more information.
17
-#' @param nIterations number of iterations performed during layout optimization.
18
-#' Default is 200.
19
-#' @param alpha initial value of "learning rate" of layout optimization. Default is 1.
20
-#' @param minDist The effective minimum distance between embedded points.
21
-#'    Smaller values will result in a more clustered/clumped
22
-#'    embedding where nearby points on the manifold are drawn
23
-#'    closer together, while larger values will result on a more
24
-#'    even dispersal of points. Default 0.01.
25
-#'    See `?uwot::umap` for more information.
26
-#' @param spread The effective scale of embedded points. In combination with
27
-#'    ‘min_dist’, this determines how clustered/clumped the
28
-#'    embedded points are. Default 1.
29
-#'    See `?uwot::umap` for more information.
30
-#' @param pca Logical. Whether to perform dimensionality reduction with PCA
31
-#' before UMAP.
32
-#' @param initialDims  Number of dimensions from PCA to use as
33
-#' input in UMAP. Default 50.
34
-#'
35
-#' @return A \linkS4class{SingleCellExperiment} object with the reduced
36
-#' dimensions updated under reducedDimName specified.
37
-#' @export
38
-#'
39
-#' @examples
40
-#' data(scExample, package = "singleCellTK")
41
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
42
-#' umap_res <- getUMAP(inSCE = sce, useAssay = "counts",
43
-#'                     reducedDimName = "UMAP", logNorm = TRUE,
44
-#'                     nNeighbors = 30, alpha = 1,
45
-#'                     nIterations = 200, spread = 1, pca = TRUE,
46
-#'                     initialDims = 50)
47
-#' reducedDims(umap_res)
48
-
49
-getUMAP <- function(inSCE, useAssay = "counts",
50
-                    sample = NULL,
51
-                    reducedDimName = "UMAP",
52
-                    logNorm = TRUE,
53
-                    nNeighbors = 30,
54
-                    nIterations = 200,
55
-                    alpha = 1,
56
-                    minDist = 0.01,
57
-                    spread = 1,
58
-                    pca = TRUE,
59
-                    initialDims = 50) {
60
-  if (!inherits(inSCE, "SingleCellExperiment")){
61
-    stop("Please use a SingleCellExperiment object")
62
-  }
63
-  #test for assay existing
64
-    if (!all(useAssay %in% names(assays(inSCE)))){
65
-        stop("assay '", useAssay, "' does not exist.")
66
-    }
67
-
68
-    if(!is.null(sample)) {
69
-        if(length(sample) != ncol(inSCE)) {
70
-            stop("'sample' must be the same length as the number of columns in 'inSCE'")
71
-        }
72
-    } else {
73
-        sample = rep(1, ncol(inSCE))
74
-    }
75
-    samples <- unique(sample)
76
-    umapDims = matrix(nrow = ncol(inSCE), ncol = 2)
77
-    for (i in seq_len(length(samples))){
78
-        useAssayTemp = useAssay
79
-	sceSampleInd <- sample == samples[i]
80
-        sceSample <- inSCE[, sceSampleInd]
81
-        if(logNorm){
82
-	    sceSample <- scater_logNormCounts(sceSample, useAssay = useAssay)
83
-            useAssayTemp = "ScaterLogNormCounts"
84
-        }
85
-
86
-        matColData <- SummarizedExperiment::assay(sceSample, useAssayTemp)
87
-        matColData <- as.matrix(matColData)
88
-
89
-        if (isTRUE(pca)) {
90
-          if(initialDims > ncol(matColData)){
91
-            doPCA <- ncol(matColData)
92
-          }else{
93
-            doPCA <- initialDims
94
-          }
95
-        } else {
96
-            doPCA <- NULL
97
-        }
98
-        if(nNeighbors > ncol(matColData)){
99
-          nNeighbors <- ncol(matColData)
100
-        }
101
-
102
-        umapRes <- uwot::umap(t(matColData), n_neighbors = nNeighbors,
103
-                              learning_rate = alpha,
104
-                              min_dist = minDist, spread = spread,
105
-                              n_sgd_threads = 1, pca = doPCA,
106
-                              n_epochs = nIterations)
107
-        if (is.null(rownames(sceSample))) {
108
-            rownames(umapRes) <- colnames(sceSample)
109
-        }
110
-        umapDims[sceSampleInd, ] = umapRes
111
-    }
112
-    colnames(umapDims) <- c("UMAP1", "UMAP2")
113
-    SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- umapDims
114
-    return(inSCE)
115
-}
1
+#' Uniform Manifold Approximation and Projection(UMAP) algorithm for
2
+#' dimension reduction.
3
+#'
4
+#' @param inSCE Input \linkS4class{SingleCellExperiment} object.
5
+#' @param useAssay Indicate which assay to use. The default is "counts".
6
+#' @param sample Character vector. Indicates which sample each cell belongs to.
7
+#' @param reducedDimName a name to store the results of the dimension reduction
8
+#' coordinates obtained from this method. This is stored in the SingleCellExperiment
9
+#' object in the reducedDims slot. Default "UMAP".
10
+#' @param logNorm Whether the counts will need to be log-normalized prior to
11
+#' generating the UMAP via scater::logNormCounts. Default TRUE.
12
+#' @param nNeighbors The size of local neighborhood used for
13
+#'   manifold approximation. Larger values result in more global
14
+#'   views of the manifold, while smaller values result in more
15
+#'   local data being preserved. Default 30.
16
+#'    See `?uwot::umap` for more information.
17
+#' @param nIterations number of iterations performed during layout optimization.
18
+#' Default is 200.
19
+#' @param alpha initial value of "learning rate" of layout optimization. Default is 1.
20
+#' @param minDist The effective minimum distance between embedded points.
21
+#'    Smaller values will result in a more clustered/clumped
22
+#'    embedding where nearby points on the manifold are drawn
23
+#'    closer together, while larger values will result on a more
24
+#'    even dispersal of points. Default 0.01.
25
+#'    See `?uwot::umap` for more information.
26
+#' @param spread The effective scale of embedded points. In combination with
27
+#'    ‘min_dist’, this determines how clustered/clumped the
28
+#'    embedded points are. Default 1.
29
+#'    See `?uwot::umap` for more information.
30
+#' @param pca Logical. Whether to perform dimensionality reduction with PCA
31
+#' before UMAP.
32
+#' @param initialDims  Number of dimensions from PCA to use as
33
+#' input in UMAP. Default 50.
34
+#'
35
+#' @return A \linkS4class{SingleCellExperiment} object with the reduced
36
+#' dimensions updated under reducedDimName specified.
37
+#' @export
38
+#'
39
+#' @examples
40
+#' data(scExample, package = "singleCellTK")
41
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
42
+#' umap_res <- getUMAP(inSCE = sce, useAssay = "counts",
43
+#'                     reducedDimName = "UMAP", logNorm = TRUE,
44
+#'                     nNeighbors = 30, alpha = 1,
45
+#'                     nIterations = 200, spread = 1, pca = TRUE,
46
+#'                     initialDims = 50)
47
+#' reducedDims(umap_res)
48
+
49
+getUMAP <- function(inSCE, useAssay = "counts",
50
+                    sample = NULL,
51
+                    reducedDimName = "UMAP",
52
+                    logNorm = TRUE,
53
+                    nNeighbors = 30,
54
+                    nIterations = 200,
55
+                    alpha = 1,
56
+                    minDist = 0.01,
57
+                    spread = 1,
58
+                    pca = TRUE,
59
+                    initialDims = 50) {
60
+  if (!inherits(inSCE, "SingleCellExperiment")){
61
+    stop("Please use a SingleCellExperiment object")
62
+  }
63
+  #test for assay existing
64
+    if (!all(useAssay %in% names(assays(inSCE)))){
65
+        stop("assay '", useAssay, "' does not exist.")
66
+    }
67
+
68
+    if(!is.null(sample)) {
69
+        if(length(sample) != ncol(inSCE)) {
70
+            stop("'sample' must be the same length as the number of columns in 'inSCE'")
71
+        }
72
+    } else {
73
+        sample = rep(1, ncol(inSCE))
74
+    }
75
+    samples <- unique(sample)
76
+    umapDims = matrix(nrow = ncol(inSCE), ncol = 2)
77
+    for (i in seq_len(length(samples))){
78
+        useAssayTemp = useAssay
79
+	sceSampleInd <- sample == samples[i]
80
+        sceSample <- inSCE[, sceSampleInd]
81
+        if(logNorm){
82
+	    sceSample <- scater_logNormCounts(sceSample, useAssay = useAssay)
83
+            useAssayTemp = "ScaterLogNormCounts"
84
+        }
85
+
86
+        matColData <- SummarizedExperiment::assay(sceSample, useAssayTemp)
87
+        matColData <- as.matrix(matColData)
88
+
89
+        if (isTRUE(pca)) {
90
+          if(initialDims > ncol(matColData)){
91
+            doPCA <- ncol(matColData)
92
+          }else{
93
+            doPCA <- initialDims
94
+          }
95
+        } else {
96
+            doPCA <- NULL
97
+        }
98
+        if(nNeighbors > ncol(matColData)){
99
+          nNeighbors <- ncol(matColData)
100
+        }
101
+
102
+        umapRes <- uwot::umap(t(matColData), n_neighbors = nNeighbors,
103
+                              learning_rate = alpha,
104
+                              min_dist = minDist, spread = spread,
105
+                              n_sgd_threads = 1, pca = doPCA,
106
+                              n_epochs = nIterations)
107
+        if (is.null(rownames(sceSample))) {
108
+            rownames(umapRes) <- colnames(sceSample)
109
+        }
110
+        umapDims[sceSampleInd, ] = umapRes
111
+    }
112
+    colnames(umapDims) <- c("UMAP1", "UMAP2")
113
+    SingleCellExperiment::reducedDim(inSCE, reducedDimName) <- umapDims
114
+    return(inSCE)
115
+}
... ...
@@ -47,7 +47,7 @@ reportDropletQC <- function(inSCE, output_file = NULL,
47 47
 #' @return .html file
48 48
 #' @examples
49 49
 #' data(scExample, package = "singleCellTK")
50
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
50
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
51 51
 #' \dontrun{
52 52
 #' sce <- runCellQC(sce)
53 53
 #' reportCellQC(inSCE = sce)
... ...
@@ -84,7 +84,7 @@ reportCellQC <- function(inSCE, output_file = NULL,
84 84
 #' @return .html file
85 85
 #' @examples
86 86
 #' data(scExample, package = "singleCellTK")
87
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
87
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
88 88
 #' \donttest{
89 89
 #' sce <- runDecontX(sce)
90 90
 #' sce <- getUMAP(sce)
... ...
@@ -165,12 +165,6 @@ reportQCTool <- function(inSCE, algorithm=c("BarcodeRankDrops",
165 165
 #' @param output_dir name of the output directory to save the rendered file. If
166 166
 #' \code{NULL} the file is stored to the current working directory.
167 167
 #' Default \code{NULL}.
168
-#' @examples
169
-#' data(scExample, package = "singleCellTK")
170
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
171
-#' sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
172
-#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
173
-#' reportDiffExp(sce, study = "DESeq2", output_file = "DESeq2_res")
174 168
 #' @return .html file
175 169
 #' @export
176 170
 reportDiffExp <- function(inSCE, study,
... ...
@@ -262,7 +262,7 @@ plotDEGRegression <- function(inSCE, useResult, threshP = FALSE, labelBy = NULL,
262 262
 #' @examples
263 263
 #' data(scExample, package = "singleCellTK")
264 264
 #' \dontrun{
265
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
265
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
266 266
 #' sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
267 267
 #'  groupName2 = "Sample2", index1 = 1:100, index2 = 101:190, analysisName = "DESeq2")
268 268
 #' plotDEGHeatmap(sce, useResult = "DESeq2", fdrThreshold = 1)
... ...
@@ -177,7 +177,7 @@ dataAnnotationColor <- function(inSCE, axis = NULL,
177 177
 #' @param ... Other arguments passed to \code{\link[ComplexHeatmap]{Heatmap}}.
178 178
 #' @examples
179 179
 #' data(scExample, package = "singleCellTK")
180
-#' plotSCEHeatmap(sce[1:3,1:3])
180
+#' plotSCEHeatmap(sce[1:3,1:3], useAssay = "counts")
181 181
 #' @return A \code{\link[ComplexHeatmap]{Heatmap}} object
182 182
 #' @export
183 183
 #' @author Yichen Wang
... ...
@@ -15,7 +15,7 @@
15 15
 #'
16 16
 #' @examples
17 17
 #' data(scExample, package = "singleCellTK")
18
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
18
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
19 19
 #' sce <- getUMAP(inSCE = sce, useAssay = "counts", reducedDimName = "UMAP")
20 20
 #' plotUMAP(sce, shape = "No Shape", reducedDimName = "UMAP",
21 21
 #'          runUMAP = TRUE, useAssay = "counts")
... ...
@@ -127,7 +127,7 @@
127 127
 #' \code{\link{runANOVA}}
128 128
 #' @examples
129 129
 #' data(scExample, package = "singleCellTK")
130
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
130
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
131 131
 #' sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
132 132
 #'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
133 133
 #' @return Input SCE object with \code{metadata(inSCE)} updated with name
... ...
@@ -190,7 +190,7 @@ runDEAnalysis <- function(method = 'MAST', ...){
190 190
 #' Default \code{FALSE}.
191 191
 #' @examples
192 192
 #' data(scExample, package = "singleCellTK")
193
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
193
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
194 194
 #' sce <- runDESeq2(inSCE = sce, groupName1 = "Sample1",
195 195
 #'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
196 196
 #'
... ...
@@ -317,7 +317,7 @@ runDESeq2 <- function(inSCE, useAssay = 'counts', index1 = NULL,
317 317
 #' Default \code{FALSE}.
318 318
 #' @examples
319 319
 #' data(scExample, package = "singleCellTK")
320
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
320
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
321 321
 #' sce@assays@data$logcounts <- log10(counts(sce) + 1)
322 322
 #' sce <- runLimmaDE(inSCE = sce, groupName1 = "Sample1",
323 323
 #'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "Limma")
... ...
@@ -440,7 +440,7 @@ runLimmaDE <- function(inSCE, useAssay = 'logcounts', index1 = NULL,
440 440
 #' Default \code{FALSE}.
441 441
 #' @examples
442 442
 #' data(scExample, package = "singleCellTK")
443
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
443
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
444 444
 #' sce@assays@data$logcounts <- log10(counts(sce) + 1)
445 445
 #' sce <- runANOVA(inSCE = sce, groupName1 = "Sample1",
446 446
 #'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
... ...
@@ -586,7 +586,7 @@ runANOVA <- function(inSCE, useAssay = 'logcounts', index1 = NULL,
586 586
 #' Default \code{FALSE}.
587 587
 #' @examples
588 588
 #' data(scExample, package = "singleCellTK")
589
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
589
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
590 590
 #' sce@assays@data$logcounts <- log10(counts(sce) + 1)
591 591
 #' sce <- runMAST(inSCE = sce, groupName1 = "Sample1",
592 592
 #'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "MAST")
... ...
@@ -20,7 +20,7 @@
20 20
 #' of \code{inSCE}.
21 21
 #' @examples
22 22
 #' data(scExample, package = "singleCellTK")
23
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
23
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
24 24
 #' \donttest{
25 25
 #' sce <- runCellQC(sce)
26 26
 #' }
... ...
@@ -118,7 +118,7 @@
118 118
 #' QC stats stored in the inSCE.
119 119
 #' @examples
120 120
 #' data(scExample, package = "singleCellTK")
121
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
121
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
122 122
 #' sampleSummaryStats(sce, simple = TRUE)
123 123
 #' @importFrom magrittr %>%
124 124
 #' @export
... ...
@@ -23,7 +23,7 @@
23 23
 #'  Please refer to the documentation of \link[scds]{cxds} for details.
24 24
 #' @examples
25 25
 #' data(scExample, package = "singleCellTK")
26
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
26
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
27 27
 #' sce <- runCxds(sce)
28 28
 #' @export
29 29
 #' @importFrom SummarizedExperiment colData colData<-
... ...
@@ -31,9 +31,9 @@
31 31
 runCxds <- function(inSCE,
32 32
     sample = NULL,
33 33
     seed = 12345,
34
-    ntop = 500, 
34
+    ntop = 500,
35 35
     binThresh = 0,
36
-    verb = FALSE, 
36
+    verb = FALSE,
37 37
     retRes = FALSE,
38 38
     estNdbl = FALSE,
39 39
     useAssay = "counts") {
... ...
@@ -48,7 +48,7 @@ runCxds <- function(inSCE,
48 48
     }
49 49
 
50 50
     message(paste0(date(), " ... Running 'cxds'"))
51
-    
51
+
52 52
     ## Getting current arguments
53 53
     #argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
54 54
     argsList <- mget(names(formals()),sys.frame(sys.nframe()))
... ...
@@ -71,19 +71,19 @@ runCxds <- function(inSCE,
71 71
 
72 72
         mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
73 73
         counts(sceSample) <- .convertToMatrix(mat)
74
-        
74
+
75 75
         result <- NULL
76 76
         nGene <- 500
77 77
         while(!inherits(result, "SingleCellExperiment") & nGene > 0) {
78 78
           try({result <- withr::with_seed(seed, scds::cxds(sce = sceSample,
79
-                                                           ntop = nGene, 
80
-                                                           binThresh = binThresh, 
81
-                                                           verb = verb, 
79
+                                                           ntop = nGene,
80
+                                                           binThresh = binThresh,
81
+                                                           verb = verb,
82 82
                                                            retRes = retRes,
83 83
                                                            estNdbl = estNdbl))}, silent = TRUE)
84 84
           nGene <- nGene - 100
85
-        }  
86
-        
85
+        }
86
+
87 87
         if (!inherits(result, "try-error") & !is.null(result)) {
88 88
           if ("cxds_call" %in% colnames(SummarizedExperiment::colData(result))) {
89 89
               output[sceSampleInd, ] <- SummarizedExperiment::colData(result)[,
... ...
@@ -95,12 +95,12 @@ runCxds <- function(inSCE,
95 95
         } else {
96 96
           output[sceSampleInd, ] <- NA
97 97
           warning(paste0("'cxds' from package 'scds' did not complete successfully for sample", samples[i]))
98
-        }            
98
+        }
99 99
     }
100 100
 
101 101
     colnames(output) <- paste0("scds_", colnames(output))
102 102
     colData(inSCE) = cbind(colData(inSCE), output)
103
-    
103
+
104 104
     inSCE@metadata$runCxds <- argsList[-1]
105 105
     inSCE@metadata$runCxds$packageVersion <- utils::packageDescription("scds")$Version
106 106
 
... ...
@@ -120,7 +120,7 @@ runCxds <- function(inSCE,
120 120
 #'  separately. If NULL, then all cells will be processed together.
121 121
 #'  Default NULL.
122 122
 #' @param seed Seed for the random number generator. Default 12345.
123
-#' @param ntop See \link[scds]{bcds} for more information. Default \code{500}. 
123
+#' @param ntop See \link[scds]{bcds} for more information. Default \code{500}.
124 124
 #' @param srat See \link[scds]{bcds} for more information. Default \code{1}.
125 125
 #' @param verb See \link[scds]{bcds} for more information. Default \code{FALSE}.
126 126
 #' @param retRes See \link[scds]{bcds} for more information. Default \code{FALSE}.
... ...
@@ -135,7 +135,7 @@ runCxds <- function(inSCE,
135 135
 #'  Please refer to the documentation of \link[scds]{bcds} for details.
136 136
 #' @examples
137 137
 #' data(scExample, package = "singleCellTK")
138
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
138
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
139 139
 #' sce <- runBcds(sce)
140 140
 #' @export
141 141
 #' @importFrom SummarizedExperiment colData colData<-
... ...
@@ -143,11 +143,11 @@ runCxds <- function(inSCE,
143 143
 runBcds <- function(inSCE,
144 144
     sample = NULL,
145 145
     seed = 12345,
146
-    ntop = 500, 
147
-    srat = 1, 
146
+    ntop = 500,
147
+    srat = 1,
148 148
     verb = FALSE,
149 149
     retRes = FALSE,
150
-    nmax = "tune", 
150
+    nmax = "tune",
151 151
     varImp = FALSE,
152 152
     estNdbl = FALSE,
153 153
     useAssay = "counts"
... ...
@@ -163,7 +163,7 @@ runBcds <- function(inSCE,
163 163
     }
164 164
 
165 165
     message(paste0(date(), " ... Running 'bcds'"))
166
-    
166
+
167 167
     ## Getting current arguments
168 168
     #argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
169 169
     argsList <- mget(names(formals()),sys.frame(sys.nframe()))
... ...
@@ -186,22 +186,22 @@ runBcds <- function(inSCE,
186 186
 
187 187
         mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
188 188
         counts(sceSample) <- .convertToMatrix(mat)
189
-        
189
+
190 190
         result <- NULL
191 191
         nGene <- 500
192 192
         while(!inherits(result, "SingleCellExperiment") & nGene > 0) {
193 193
           try({result <- withr::with_seed(seed,
194 194
             scds::bcds(sce = sceSample,
195 195
                        ntop = nGene,
196
-                       srat = srat, 
196
+                       srat = srat,
197 197
                        verb = verb,
198 198
                        retRes = retRes,
199
-                       nmax = nmax, 
199
+                       nmax = nmax,
200 200
                        varImp = varImp,
201 201
                        estNdbl = estNdbl
202 202
             ))}, silent = TRUE)
203 203
           nGene <- nGene - 100
204
-        }  
204
+        }
205 205
 
206 206
         if (!inherits(result, "try-error") & !is.null(result)) {
207 207
           if ("bcds_call" %in% colnames(SummarizedExperiment::colData(result))) {
... ...
@@ -214,13 +214,13 @@ runBcds <- function(inSCE,
214 214
         } else {
215 215
           output[sceSampleInd, ] <- NA
216 216
           warning(paste0("'bcds' from package 'scds' did not complete successfully for sample", samples[i]))
217
-        }  
218
- 
217
+        }
218
+
219 219
     }
220 220
 
221 221
     colnames(output) <- paste0("scds_", colnames(output))
222 222
     colData(inSCE) = cbind(colData(inSCE), output)
223
-    
223
+
224 224
     inSCE@metadata$runBcds <- argsList[-1]
225 225
     inSCE@metadata$runBcds$packageVersion <- utils::packageDescription("scds")$Version
226 226
 
... ...
@@ -241,7 +241,7 @@ runBcds <- function(inSCE,
241 241
 #'  Default NULL.
242 242
 #' @param seed Seed for the random number generator. Default 12345.
243 243
 #' @param nTop The number of top varialbe genes to consider. Used in both \code{csds}
244
-#' and \code{bcds}. Default \code{500}. 
244
+#' and \code{bcds}. Default \code{500}.
245 245
 #' @param cxdsArgs See \link[scds]{cxds_bcds_hybrid} for more information. Default \code{NULL}.
246 246
 #' @param bcdsArgs See \link[scds]{cxds_bcds_hybrid} for more information. Default \code{NULL}.
247 247
 #' @param verb See \link[scds]{cxds_bcds_hybrid} for more information. Default \code{FALSE}.
... ...
@@ -256,7 +256,7 @@ runBcds <- function(inSCE,
256 256
 #'  details.
257 257
 #' @examples
258 258
 #' data(scExample, package = "singleCellTK")
259
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
259
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
260 260
 #' sce <- runCxdsBcdsHybrid(sce)
261 261
 #' @export
262 262
 #' @importFrom SummarizedExperiment colData colData<-
... ...
@@ -266,7 +266,7 @@ runCxdsBcdsHybrid <- function(inSCE,
266 266
     seed = 12345,
267 267
     nTop = 500,
268 268
     cxdsArgs = list(),
269
-    bcdsArgs = list(), 
269
+    bcdsArgs = list(),
270 270
     verb = FALSE,
271 271
     estNdbl = FALSE,
272 272
     force = FALSE,
... ...
@@ -309,14 +309,14 @@ runCxdsBcdsHybrid <- function(inSCE,
309 309
         result <- NULL
310 310
         nGene <- 500
311 311
         while(!inherits(result, "SingleCellExperiment") & nGene > 0) {
312
-          try({result <- withr::with_seed(seed, scds::cxds_bcds_hybrid(sce = sceSample, 
313
-                                                                       cxdsArgs=c(list(ntop = nGene), cxdsArgs), 
314
-                                                                       bcdsArgs=c(list(ntop = nGene), bcdsArgs), 
312
+          try({result <- withr::with_seed(seed, scds::cxds_bcds_hybrid(sce = sceSample,
313
+                                                                       cxdsArgs=c(list(ntop = nGene), cxdsArgs),
314
+                                                                       bcdsArgs=c(list(ntop = nGene), bcdsArgs),
315 315
                                                                        verb = verb,
316 316
                                                                        estNdbl = estNdbl,
317 317
                                                                        force = force))}, silent = TRUE)
318 318
           nGene <- nGene - 100
319
-        }  
319
+        }
320 320
 
321 321
         if (!inherits(result, "try-error") & !is.null(result)) {
322 322
           if ("hybrid_call" %in% colnames(SummarizedExperiment::colData(result))) {
... ...
@@ -329,12 +329,12 @@ runCxdsBcdsHybrid <- function(inSCE,
329 329
         } else {
330 330
           output[sceSampleInd, ] <- NA
331 331
           warning(paste0("'cxds_bcds_hybrid' from package 'scds' did not complete successfully for sample", samples[i]))
332
-        }   
332
+        }
333 333
     }
334 334
 
335 335
     colnames(output) <- paste0("scds_", colnames(output))
336 336
     colData(inSCE) = cbind(colData(inSCE), output)
337
-    
337
+
338 338
     inSCE@metadata$runCxdsBcdsHybrid <- argsList[-1]
339 339
     inSCE@metadata$runCxdsBcdsHybrid$packageVersion <- utils::packageDescription("scds")$Version
340 340
 
... ...
@@ -88,7 +88,7 @@
88 88
 #' @seealso \link[scran]{doubletCells}
89 89
 #' @examples
90 90
 #' data(scExample, package = "singleCellTK")
91
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
91
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
92 92
 #' sce <- runDoubletCells(sce)
93 93
 #' @export
94 94
 #' @importFrom SummarizedExperiment colData colData<-
... ...
@@ -1,231 +1,231 @@
1
-#' @title Find doublets using \code{scrublet}.
2
-#' @description A wrapper function that calls \code{scrub_doublets} from python
3
-#'  module \code{scrublet}. Simulates doublets from the observed data and uses
4
-#'  a k-nearest-neighbor classifier to calculate a continuous
5
-#'  \code{scrublet_score} (between 0 and 1) for each transcriptome. The score
6
-#'  is automatically thresholded to generate \code{scrublet_call}, a boolean
7
-#'  array that is \code{TRUE} for predicted doublets and \code{FALSE}
8
-#'  otherwise.
9
-#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object.
10
-#'  Needs \code{counts} in assays slot.
11
-#' @param sample Character vector. Indicates which sample each cell belongs to.
12
-#'  Scrublet will be run on cells from each sample separately. If NULL, then
13
-#'  all cells will be processed together. Default \code{NULL}.
14
-#' @param useAssay  A string specifying which assay in the SCE to use. Default
15
-#'  'counts'.
16
-#' @param simDoubletRatio Numeric. Number of doublets to simulate relative to
17
-#'  the number of observed transcriptomes. Default 2.0.
18
-#' @param nNeighbors Integer. Number of neighbors used to construct the KNN
19
-#'  graph of observed transcriptomes and simulated doublets. If \code{NULL},
20
-#'  this is set to \code{round(0.5 * sqrt(n_cells))}. Default \code{NULL}.
21
-#' @param minDist Float Determines how tightly UMAP packs points together. If \code{NULL},
22
-#'  this is set to 0.1. Default \code{NULL}.
23
-#' @param expectedDoubletRate The estimated doublet rate for the experiment.
24
-#'  Default 0.1.
25
-#' @param stdevDoubletRate Uncertainty in the expected doublet rate.
26
-#'  Default 0.02.
27
-#' @param syntheticDoubletUmiSubsampling Numeric. Rate for sampling UMIs
28
-#'  when creating synthetic doublets. If 1.0, each doublet is created by simply
29
-#'  adding the UMIs from two randomly sampled observed transcriptomes. For
30
-#'  values less than 1, the UMI counts are added and then randomly sampled at
31
-#'  the specified rate. Defuault: 1.0.
32
-#' @param useApproxNeighbors Boolean. Use approximate nearest neighbor method
33
-#'  (\code{annoy}) for the KNN classifier. Default \code{TRUE}.
34
-#' @param distanceMetric Character. Distance metric used when finding nearest
35
-#'  neighbors.
36
-#'  For list of valid values, see the documentation for \code{annoy} (if
37
-#'  \code{useApproxNeighbors} is \code{TRUE}) or
38
-#'  \code{sklearn.neighbors.NearestNeighbors} (if \code{useApproxNeighbors} is
39
-#'  \code{FALSE}). Default "euclidean".
40
-#' @param getDoubletNeighborParents Boolean. If \code{TRUE}, return the
41
-#'  parent transcriptomes that generated the doublet neighbors of each
42
-#'  observed transcriptome. This information can be used to infer the cell
43
-#'  states that generated a given doublet state. Default \code{FALSE}.
44
-#' @param minCounts Numeric. Used for gene filtering prior to PCA. Genes
45
-#'  expressed at fewer than \code{minCounts} in fewer than \code{minCells}
46
-#'  (see below) are excluded. Default 3.
47
-#' @param minCells Integer. Used for gene filtering prior to PCA. Genes
48
-#'  expressed at fewer than \code{minCounts} (see above) in fewer than
49
-#'  \code{minCells} are excluded. Default 3.
50
-#' @param minGeneVariabilityPctl Numeric. Used for gene filtering prior to
51
-#'  PCA. Keep the most highly variable genes (in the top
52
-#'  minGeneVariabilityPctl percentile), as measured by the v-statistic
53
-#'  (\emph{Klein et al., Cell 2015}). Default 85.
54
-#' @param logTransform Boolean. If \code{TRUE}, log-transform the counts matrix
55
-#'  (log10(1+TPM)). \code{sklearn.decomposition.TruncatedSVD} will be used for
56
-#'  dimensionality reduction, unless \code{meanCenter} is \code{TRUE}.
57
-#'  Default \code{FALSE}.
58
-#' @param meanCenter If \code{TRUE}, center the data such that each gene has a
59
-#'  mean of 0. \code{sklearn.decomposition.PCA} will be used for
60
-#'  dimensionality reduction. Default \code{TRUE}.
61
-#' @param normalizeVariance Boolean. If \code{TRUE}, normalize the data such
62
-#'  that each gene has a variance of 1.
63
-#'  \code{sklearn.decomposition.TruncatedSVD} will be used for dimensionality
64
-#'  reduction, unless \code{meanCenter} is \code{TRUE}. Default \code{TRUE}.
65
-#' @param nPrinComps Integer. Number of principal components used to embed
66
-#'  the transcriptomes prior to k-nearest-neighbor graph construction.
67
-#'  Default 30.
68
-#' @param tsneAngle Float. Determines angular size of a distant node as measured
69
-#'  from a point in the t-SNE plot. If default, it is set to 0.5 Default \code{NULL}.
70
-#' @param tsnePerplexity Integer. The number of nearest neighbors that
71
-#'  is used in other manifold learning algorithms.
72
-#'  If default, it is set to 30. Default \code{NULL}.
73
-#' @param verbose Boolean. If \code{TRUE}, print progress updates. Default
74
-#'  \code{TRUE}.
75
-#' @param seed Seed for the random number generator. Default 12345.
76
-#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object with
77
-#'  \code{scrub_doublets} output appended to the
78
-#'  \link[SummarizedExperiment]{colData} slot. The columns include
79
-#'  \emph{scrublet_score} and \emph{scrublet_call}.
80
-#' @examples
81
-#' data(scExample, package = "singleCellTK")
82
-#' sce <- sce[, colData(sce)$type != 'EmptyDroplet']
83
-#' sce <- runScrublet(sce)
84
-#' @export
85
-#' @importFrom reticulate py_module_available py_set_seed import
86
-#' @importFrom SummarizedExperiment colData colData<-
87
-#' @importFrom SingleCellExperiment reducedDim
88
-runScrublet <- function(inSCE,
89
-  sample = NULL,
90
-  useAssay = "counts",
91
-  simDoubletRatio = 2.0,
92
-  nNeighbors = NULL,
93
-  minDist = NULL,
94
-  expectedDoubletRate = 0.1,
95
-  stdevDoubletRate = 0.02,
96
-  syntheticDoubletUmiSubsampling = 1.0,
97
-  useApproxNeighbors = TRUE,
98
-  distanceMetric = "euclidean",
99
-  getDoubletNeighborParents = FALSE,
100
-  minCounts = 3,
101
-  minCells = 3L,
102
-  minGeneVariabilityPctl = 85,
103
-  logTransform = FALSE,
104
-  meanCenter = TRUE,
105
-  normalizeVariance = TRUE,
106
-  nPrinComps = 30L,
107
-  tsneAngle = NULL,
108
-  tsnePerplexity = NULL,
109
-  verbose = TRUE,
110
-  seed = 12345) {
111
-
112
-  if (!reticulate::py_module_available(module = "scrublet")) {
113
-    warning("Cannot find python module 'scrublet', please install Conda and",
114
-      " run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv().",
115
-      "If one of these have been previously run to install the modules,",
116
-      "make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(),",
117
-      " respectively, if R has been restarted since the module installation.",
118
-      " Alternatively, Scrublet can be installed on the local machine",
119
-      "with pip (e.g. pip install scrublet) and then the 'use_python()'",
120
-      " function from the 'reticulate' package can be used to select the",
121
-      " correct Python environment.")
122
-    return(inSCE)
123
-  }
124
-
125
-  if (!is.null(seed)) {
126
-    reticulate::py_set_seed(seed = seed)
127
-  }
128
-
129
-  if (!is.null(sample)) {
130
-    if (length(sample) != ncol(inSCE)) {
131
-      stop("'sample' must be the same length as the number of",
132
-        " columns in 'inSCE'")
133
-    }
134
-  } else {
135
-    sample = rep(1, ncol(inSCE))
136
-  }
137
-
138
-  message(paste0(date(), " ... Running 'scrublet'"))
139
-
140
-  ##  Getting current arguments values
141
-  #argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
142
-  argsList <- mget(names(formals()),sys.frame(sys.nframe()))
143
-
144
-  ## Define result matrix for all samples
145
-  output <- S4Vectors::DataFrame(row.names = colnames(inSCE),
146
-    scrublet_score = numeric(ncol(inSCE)),
147
-    scrublet_call = logical(ncol(inSCE)))
148
-
149
-  ## Loop through each sample and run scrublet
150
-  error <- try({
151
-    samples <- unique(sample)
152
-    umapDims <- matrix(ncol = 2,
153
-                       nrow = ncol(inSCE))
154
-    rownames(umapDims) = colnames(inSCE)
155
-    colnames(umapDims) = c("UMAP_1", "UMAP_2")
156
-
157
-    tsneDims <- matrix(ncol = 2,
158
-                       nrow = ncol(inSCE))
159
-    rownames(tsneDims) = colnames(inSCE)
160
-    colnames(tsneDims) = c("TSNE_1", "TSNE_2")
161
-
162
-    for (i in seq_len(length(samples))) {
163
-      sceSampleInd <- sample == samples[i]
164
-      sceSample <- inSCE[, sceSampleInd]
165
-
166
-      mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
167
-      mat <- .convertToMatrix(mat)
168
-
169
-      scr <- scrublet$Scrublet(counts_matrix = t(mat),
170
-        sim_doublet_ratio = simDoubletRatio,
171
-        n_neighbors = nNeighbors,
172
-        expected_doublet_rate = expectedDoubletRate,
173
-        stdev_doublet_rate = stdevDoubletRate)
174
-
175
-      result <- scr$scrub_doublets(
176
-        synthetic_doublet_umi_subsampling = syntheticDoubletUmiSubsampling,
177
-        use_approx_neighbors = useApproxNeighbors,
178
-        distance_metric = distanceMetric,
179
-        get_doublet_neighbor_parents = getDoubletNeighborParents,
180
-        min_counts = minCounts,
181
-        min_cells = as.integer(minCells),
182
-        min_gene_variability_pctl = minGeneVariabilityPctl,
183
-        log_transform = logTransform,
184
-        mean_center = meanCenter,
185
-        normalize_variance = normalizeVariance,
186
-        n_prin_comps = as.integer(nPrinComps),
187
-        verbose = verbose)
188
-
189
-      output[sceSampleInd, "scrublet_score"] <- result[[1]]
190
-      output[sceSampleInd, "scrublet_call"] <- result[[2]]
191
-
192
-      ## Extract UMAP and TSNE coordinates
193
-      if (is.null(nNeighbors) && is.null(minDist)){
194
-        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_)
195
-      }else {
196
-        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_,
197
-                                              n_neighbors=as.integer(nNeighbors),
198
-                                              min_dist=minDist)
199
-      }
200
-      umapDims[sceSampleInd, ] <- umap_coordinates
201
-
202
-    if (is.null(tsneAngle) && is.null(tsnePerplexity)){
203
-      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_)
204
-    }else {
205
-      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_,
206
-                                            angle=tsneAngle,
207
-                                            perplexity=as.integer(tsnePerplexity))
208
-    }
209
-    tsneDims[sceSampleInd, ] <- tsne_coordinates
210
-
211
-  }
212
-
213
-    colData(inSCE) = cbind(colData(inSCE), output)
214
-  }, silent = TRUE)
215
-
216
-  if (inherits(error, "try-error")) {
217
-    warning("Scrublet did not complete successfully. Returning SCE without",
218
-      " making any changes. Error given by Scrublet: \n\n", error)
219
-  }
220
-
221
-  inSCE@metadata$runScrublet <- argsList[-1]
222
-
223
-  ## add scrublet version to metadata
224
-  version <- pkg_resources$require("scrublet")[[1]]
225
-  inSCE@metadata$scrublet$packageVersion <- version
226
-  reducedDim(inSCE,'scrublet_TSNE') <- tsneDims
227
-  reducedDim(inSCE,'scrublet_UMAP') <- umapDims
228
-
229
-  return(inSCE)
230
-}
231
-
1
+#' @title Find doublets using \code{scrublet}.
2
+#' @description A wrapper function that calls \code{scrub_doublets} from python
3
+#'  module \code{scrublet}. Simulates doublets from the observed data and uses
4
+#'  a k-nearest-neighbor classifier to calculate a continuous
5
+#'  \code{scrublet_score} (between 0 and 1) for each transcriptome. The score
6
+#'  is automatically thresholded to generate \code{scrublet_call}, a boolean
7
+#'  array that is \code{TRUE} for predicted doublets and \code{FALSE}
8
+#'  otherwise.
9
+#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object.
10
+#'  Needs \code{counts} in assays slot.
11
+#' @param sample Character vector. Indicates which sample each cell belongs to.
12
+#'  Scrublet will be run on cells from each sample separately. If NULL, then
13
+#'  all cells will be processed together. Default \code{NULL}.
14
+#' @param useAssay  A string specifying which assay in the SCE to use. Default
15
+#'  'counts'.
16
+#' @param simDoubletRatio Numeric. Number of doublets to simulate relative to
17
+#'  the number of observed transcriptomes. Default 2.0.
18
+#' @param nNeighbors Integer. Number of neighbors used to construct the KNN
19
+#'  graph of observed transcriptomes and simulated doublets. If \code{NULL},
20
+#'  this is set to \code{round(0.5 * sqrt(n_cells))}. Default \code{NULL}.
21
+#' @param minDist Float Determines how tightly UMAP packs points together. If \code{NULL},
22
+#'  this is set to 0.1. Default \code{NULL}.
23
+#' @param expectedDoubletRate The estimated doublet rate for the experiment.
24
+#'  Default 0.1.
25
+#' @param stdevDoubletRate Uncertainty in the expected doublet rate.
26
+#'  Default 0.02.
27
+#' @param syntheticDoubletUmiSubsampling Numeric. Rate for sampling UMIs
28
+#'  when creating synthetic doublets. If 1.0, each doublet is created by simply
29
+#'  adding the UMIs from two randomly sampled observed transcriptomes. For
30
+#'  values less than 1, the UMI counts are added and then randomly sampled at
31
+#'  the specified rate. Defuault: 1.0.
32
+#' @param useApproxNeighbors Boolean. Use approximate nearest neighbor method
33
+#'  (\code{annoy}) for the KNN classifier. Default \code{TRUE}.
34
+#' @param distanceMetric Character. Distance metric used when finding nearest
35
+#'  neighbors.
36
+#'  For list of valid values, see the documentation for \code{annoy} (if
37
+#'  \code{useApproxNeighbors} is \code{TRUE}) or
38
+#'  \code{sklearn.neighbors.NearestNeighbors} (if \code{useApproxNeighbors} is
39
+#'  \code{FALSE}). Default "euclidean".
40
+#' @param getDoubletNeighborParents Boolean. If \code{TRUE}, return the
41
+#'  parent transcriptomes that generated the doublet neighbors of each
42
+#'  observed transcriptome. This information can be used to infer the cell
43
+#'  states that generated a given doublet state. Default \code{FALSE}.
44
+#' @param minCounts Numeric. Used for gene filtering prior to PCA. Genes
45
+#'  expressed at fewer than \code{minCounts} in fewer than \code{minCells}
46
+#'  (see below) are excluded. Default 3.
47
+#' @param minCells Integer. Used for gene filtering prior to PCA. Genes
48
+#'  expressed at fewer than \code{minCounts} (see above) in fewer than
49
+#'  \code{minCells} are excluded. Default 3.
50
+#' @param minGeneVariabilityPctl Numeric. Used for gene filtering prior to
51
+#'  PCA. Keep the most highly variable genes (in the top
52
+#'  minGeneVariabilityPctl percentile), as measured by the v-statistic
53
+#'  (\emph{Klein et al., Cell 2015}). Default 85.
54
+#' @param logTransform Boolean. If \code{TRUE}, log-transform the counts matrix
55
+#'  (log10(1+TPM)). \code{sklearn.decomposition.TruncatedSVD} will be used for
56
+#'  dimensionality reduction, unless \code{meanCenter} is \code{TRUE}.
57
+#'  Default \code{FALSE}.
58
+#' @param meanCenter If \code{TRUE}, center the data such that each gene has a
59
+#'  mean of 0. \code{sklearn.decomposition.PCA} will be used for
60
+#'  dimensionality reduction. Default \code{TRUE}.
61
+#' @param normalizeVariance Boolean. If \code{TRUE}, normalize the data such
62
+#'  that each gene has a variance of 1.
63
+#'  \code{sklearn.decomposition.TruncatedSVD} will be used for dimensionality
64
+#'  reduction, unless \code{meanCenter} is \code{TRUE}. Default \code{TRUE}.
65
+#' @param nPrinComps Integer. Number of principal components used to embed
66
+#'  the transcriptomes prior to k-nearest-neighbor graph construction.
67
+#'  Default 30.
68
+#' @param tsneAngle Float. Determines angular size of a distant node as measured
69
+#'  from a point in the t-SNE plot. If default, it is set to 0.5 Default \code{NULL}.
70
+#' @param tsnePerplexity Integer. The number of nearest neighbors that
71
+#'  is used in other manifold learning algorithms.
72
+#'  If default, it is set to 30. Default \code{NULL}.
73
+#' @param verbose Boolean. If \code{TRUE}, print progress updates. Default
74
+#'  \code{TRUE}.
75
+#' @param seed Seed for the random number generator. Default 12345.
76
+#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object with
77
+#'  \code{scrub_doublets} output appended to the
78
+#'  \link[SummarizedExperiment]{colData} slot. The columns include
79
+#'  \emph{scrublet_score} and \emph{scrublet_call}.
80
+#' @examples
81
+#' data(scExample, package = "singleCellTK")
82
+#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
83
+#' sce <- runScrublet(sce)
84
+#' @export
85
+#' @importFrom reticulate py_module_available py_set_seed import
86
+#' @importFrom SummarizedExperiment colData colData<-
87
+#' @importFrom SingleCellExperiment reducedDim
88
+runScrublet <- function(inSCE,
89
+  sample = NULL,
90
+  useAssay = "counts",
91
+  simDoubletRatio = 2.0,
92
+  nNeighbors = NULL,
93
+  minDist = NULL,
94
+  expectedDoubletRate = 0.1,
95
+  stdevDoubletRate = 0.02,
96
+  syntheticDoubletUmiSubsampling = 1.0,
97
+  useApproxNeighbors = TRUE,
98
+  distanceMetric = "euclidean",
99
+  getDoubletNeighborParents = FALSE,
100
+  minCounts = 3,
101
+  minCells = 3L,
102
+  minGeneVariabilityPctl = 85,
103
+  logTransform = FALSE,
104
+  meanCenter = TRUE,
105
+  normalizeVariance = TRUE,
106
+  nPrinComps = 30L,
107
+  tsneAngle = NULL,
108
+  tsnePerplexity = NULL,
109
+  verbose = TRUE,
110
+  seed = 12345) {
111
+
112
+  if (!reticulate::py_module_available(module = "scrublet")) {
113
+    warning("Cannot find python module 'scrublet', please install Conda and",
114
+      " run sctkPythonInstallConda() or run sctkPythonInstallVirtualEnv().",
115
+      "If one of these have been previously run to install the modules,",
116
+      "make sure to run selectSCTKConda() or selectSCTKVirtualEnvironment(),",
117
+      " respectively, if R has been restarted since the module installation.",
118
+      " Alternatively, Scrublet can be installed on the local machine",
119
+      "with pip (e.g. pip install scrublet) and then the 'use_python()'",
120
+      " function from the 'reticulate' package can be used to select the",
121
+      " correct Python environment.")
122
+    return(inSCE)
123
+  }
124
+
125
+  if (!is.null(seed)) {
126
+    reticulate::py_set_seed(seed = seed)
127
+  }
128
+
129
+  if (!is.null(sample)) {
130
+    if (length(sample) != ncol(inSCE)) {
131
+      stop("'sample' must be the same length as the number of",
132
+        " columns in 'inSCE'")
133
+    }
134
+  } else {
135
+    sample = rep(1, ncol(inSCE))
136
+  }
137
+
138
+  message(paste0(date(), " ... Running 'scrublet'"))
139
+
140
+  ##  Getting current arguments values
141
+  #argsList <- as.list(formals(fun = sys.function(sys.parent()), envir = parent.frame()))
142
+  argsList <- mget(names(formals()),sys.frame(sys.nframe()))
143
+
144
+  ## Define result matrix for all samples
145
+  output <- S4Vectors::DataFrame(row.names = colnames(inSCE),
146
+    scrublet_score = numeric(ncol(inSCE)),
147
+    scrublet_call = logical(ncol(inSCE)))
148
+
149
+  ## Loop through each sample and run scrublet
150
+  error <- try({
151
+    samples <- unique(sample)
152
+    umapDims <- matrix(ncol = 2,
153
+                       nrow = ncol(inSCE))
154
+    rownames(umapDims) = colnames(inSCE)
155
+    colnames(umapDims) = c("UMAP_1", "UMAP_2")
156
+
157
+    tsneDims <- matrix(ncol = 2,
158
+                       nrow = ncol(inSCE))
159
+    rownames(tsneDims) = colnames(inSCE)
160
+    colnames(tsneDims) = c("TSNE_1", "TSNE_2")
161
+
162
+    for (i in seq_len(length(samples))) {
163
+      sceSampleInd <- sample == samples[i]
164
+      sceSample <- inSCE[, sceSampleInd]
165
+
166
+      mat <- SummarizedExperiment::assay(sceSample, i = useAssay)
167
+      mat <- .convertToMatrix(mat)
168
+
169
+      scr <- scrublet$Scrublet(counts_matrix = t(mat),
170
+        sim_doublet_ratio = simDoubletRatio,
171
+        n_neighbors = nNeighbors,
172
+        expected_doublet_rate = expectedDoubletRate,
173
+        stdev_doublet_rate = stdevDoubletRate)
174
+
175
+      result <- scr$scrub_doublets(
176
+        synthetic_doublet_umi_subsampling = syntheticDoubletUmiSubsampling,
177
+        use_approx_neighbors = useApproxNeighbors,
178
+        distance_metric = distanceMetric,
179
+        get_doublet_neighbor_parents = getDoubletNeighborParents,
180
+        min_counts = minCounts,
181
+        min_cells = as.integer(minCells),
182
+        min_gene_variability_pctl = minGeneVariabilityPctl,
183
+        log_transform = logTransform,
184
+        mean_center = meanCenter,
185
+        normalize_variance = normalizeVariance,
186
+        n_prin_comps = as.integer(nPrinComps),
187
+        verbose = verbose)
188
+
189
+      output[sceSampleInd, "scrublet_score"] <- result[[1]]
190
+      output[sceSampleInd, "scrublet_call"] <- result[[2]]
191
+
192
+      ## Extract UMAP and TSNE coordinates
193
+      if (is.null(nNeighbors) && is.null(minDist)){
194
+        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_)
195
+      }else {
196
+        umap_coordinates <- scrublet$get_umap(scr$manifold_obs_,
197
+                                              n_neighbors=as.integer(nNeighbors),
198
+                                              min_dist=minDist)
199
+      }
200
+      umapDims[sceSampleInd, ] <- umap_coordinates
201
+
202
+    if (is.null(tsneAngle) && is.null(tsnePerplexity)){
203
+      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_)
204
+    }else {
205
+      tsne_coordinates <- scrublet$get_tsne(scr$manifold_obs_,
206
+                                            angle=tsneAngle,
207
+                                            perplexity=as.integer(tsnePerplexity))
208
+    }
209
+    tsneDims[sceSampleInd, ] <- tsne_coordinates
210
+
211
+  }
212
+
213
+    colData(inSCE) = cbind(colData(inSCE), output)
214
+  }, silent = TRUE)
215
+
216
+  if (inherits(error, "try-error")) {
217
+    warning("Scrublet did not complete successfully. Returning SCE without",
218
+      " making any changes. Error given by Scrublet: \n\n", error)
219
+  }
220
+
221
+  inSCE@metadata$runScrublet <- argsList[-1]
222
+
223
+  ## add scrublet version to metadata
224
+  version <- pkg_resources$require("scrublet")[[1]]
225
+  inSCE@metadata$scrublet$packageVersion <- version
226
+  reducedDim(inSCE,'scrublet_TSNE') <- tsneDims
227
+  reducedDim(inSCE,'scrublet_UMAP') <- umapDims
228
+
229
+  return(inSCE)
230
+}
231
+
... ...
@@ -73,7 +73,7 @@ dimension reduction.
73 73
 }
74 74
 \examples{
75 75
 data(scExample, package = "singleCellTK")
76
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
76
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
77 77
 umap_res <- getUMAP(inSCE = sce, useAssay = "counts",
78 78
                     reducedDimName = "UMAP", logNorm = TRUE,
79 79
                     nNeighbors = 30, alpha = 1,
... ...
@@ -141,7 +141,7 @@ Plot heatmap of using data stored in SingleCellExperiment Object
141 141
 }
142 142
 \examples{
143 143
 data(scExample, package = "singleCellTK")
144
-plotSCEHeatmap(sce[1:3,1:3])
144
+plotSCEHeatmap(sce[1:3,1:3], useAssay = "counts")
145 145
 }
146 146
 \author{
147 147
 Yichen Wang
... ...
@@ -37,7 +37,7 @@ Plot UMAP results either on already run results or run first and then plot.
37 37
 }
38 38
 \examples{
39 39
 data(scExample, package = "singleCellTK")
40
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
40
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
41 41
 sce <- getUMAP(inSCE = sce, useAssay = "counts", reducedDimName = "UMAP")
42 42
 plotUMAP(sce, shape = "No Shape", reducedDimName = "UMAP",
43 43
          runUMAP = TRUE, useAssay = "counts")
... ...
@@ -32,7 +32,7 @@ A  function to generate .html Rmarkdown report containing the visualizations of
32 32
 }
33 33
 \examples{
34 34
 data(scExample, package = "singleCellTK")
35
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
35
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
36 36
 \dontrun{
37 37
 sce <- runCellQC(sce)
38 38
 reportCellQC(inSCE = sce)
... ...
@@ -28,10 +28,3 @@ Default \code{NULL}.}
28 28
 A  function to generate .html Rmarkdown report containing the
29 29
 visualizations of the \code{\link{runDEAnalysis}} function output
30 30
 }
31
-\examples{
32
-data(scExample, package = "singleCellTK")
33
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
34
-sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
35
- groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
36
-reportDiffExp(sce, study = "DESeq2", output_file = "DESeq2_res")
37
-}
... ...
@@ -32,7 +32,7 @@ A  function to generate .html Rmarkdown report for the specified QC algorithm ou
32 32
 }
33 33
 \examples{
34 34
 data(scExample, package = "singleCellTK")
35
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
35
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
36 36
 \donttest{
37 37
 sce <- runDecontX(sce)
38 38
 sce <- getUMAP(sce)
... ...
@@ -94,7 +94,7 @@ only.
94 94
 }
95 95
 \examples{
96 96
 data(scExample, package = "singleCellTK")
97
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
97
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
98 98
 sce@assays@data$logcounts <- log10(counts(sce) + 1)
99 99
 sce <- runANOVA(inSCE = sce, groupName1 = "Sample1",
100 100
  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
... ...
@@ -60,6 +60,6 @@ A wrapper function for \link[scds]{bcds}. Annotate
60 60
 }
61 61
 \examples{
62 62
 data(scExample, package = "singleCellTK")
63
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
63
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
64 64
 sce <- runBcds(sce)
65 65
 }
... ...
@@ -53,6 +53,6 @@ A wrapper function for \link[scds]{cxds}. Annotate
53 53
 }
54 54
 \examples{
55 55
 data(scExample, package = "singleCellTK")
56
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
56
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
57 57
 sce <- runCxds(sce)
58 58
 }
... ...
@@ -59,6 +59,6 @@ A wrapper function for \link[scds]{cxds_bcds_hybrid}. Annotate
59 59
 }
60 60
 \examples{
61 61
 data(scExample, package = "singleCellTK")
62
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
62
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
63 63
 sce <- runCxdsBcdsHybrid(sce)
64 64
 }
... ...
@@ -26,7 +26,7 @@ Method supported: 'MAST', 'DESeq2', 'Limma', 'ANOVA'
26 26
 }
27 27
 \examples{
28 28
 data(scExample, package = "singleCellTK")
29
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
29
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
30 30
 sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
31 31
  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
32 32
 }
... ...
@@ -94,7 +94,7 @@ used.
94 94
 }
95 95
 \examples{
96 96
 data(scExample, package = "singleCellTK")
97
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
97
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
98 98
 sce <- runDESeq2(inSCE = sce, groupName1 = "Sample1",
99 99
  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "DESeq2")
100 100
 
... ...
@@ -96,6 +96,6 @@ A wrapper function for \link[celda]{decontX}. Identify
96 96
 }
97 97
 \examples{
98 98
 data(scExample, package = "singleCellTK")
99
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
99
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
100 100
 sce <- runDecontX(sce)
101 101
 }
... ...
@@ -93,7 +93,7 @@ This function is a wrapper function for \link[scran]{doubletCells}.
93 93
 }
94 94
 \examples{
95 95
 data(scExample, package = "singleCellTK")
96
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
96
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
97 97
 sce <- runDoubletCells(sce)
98 98
 }
99 99
 \references{
... ...
@@ -54,6 +54,6 @@ Uses doubletFinder to determine cells within the dataset
54 54
 }
55 55
 \examples{
56 56
 data(scExample, package = "singleCellTK")
57
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
57
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
58 58
 sce <- runDoubletFinder(sce)
59 59
 }
... ...
@@ -90,7 +90,7 @@ used.
90 90
 }
91 91
 \examples{
92 92
 data(scExample, package = "singleCellTK")
93
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
93
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
94 94
 sce@assays@data$logcounts <- log10(counts(sce) + 1)
95 95
 sce <- runLimmaDE(inSCE = sce, groupName1 = "Sample1",
96 96
  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "Limma")
... ...
@@ -90,7 +90,7 @@ used.
90 90
 }
91 91
 \examples{
92 92
 data(scExample, package = "singleCellTK")
93
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
93
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
94 94
 sce@assays@data$logcounts <- log10(counts(sce) + 1)
95 95
 sce <- runMAST(inSCE = sce, groupName1 = "Sample1",
96 96
  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40, analysisName = "MAST")
... ...
@@ -138,6 +138,6 @@ A wrapper function that calls \code{scrub_doublets} from python
138 138
 }
139 139
 \examples{
140 140
 data(scExample, package = "singleCellTK")
141
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
141
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
142 142
 sce <- runScrublet(sce)
143 143
 }
... ...
@@ -24,6 +24,6 @@ Plot QC metrics generated from QC algorithms via either kable or csv file.
24 24
 }
25 25
 \examples{
26 26
 data(scExample, package = "singleCellTK")
27
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
27
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
28 28
 sampleSummaryStats(sce, simple = TRUE)
29 29
 }
... ...
@@ -2,7 +2,7 @@
2 2
 library(singleCellTK)
3 3
 context("Testing decontamination algorithms")
4 4
 data(scExample, package = "singleCellTK")
5
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
5
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
6 6
 
7 7
 test_that(desc = "Testing runDecontX", {
8 8
         sceres <- runDecontX(sce)
... ...
@@ -3,16 +3,16 @@ context("Testing mergeSCColData")
3 3
 
4 4
 data(scExample, package = "singleCellTK")
5 5
 
6
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
6
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
7 7
 
8 8
 colData(sce)$column_name = rownames(colData(sce))
9 9
 test_that(desc = "Testing mergeSCEColData", {
10 10
     sce2 <- sce
11 11
     colData(sce2)$test <- 0
12
-    
12
+
13 13
     #test again
14 14
     mergedsce <- mergeSCEColData(sce, sce2)
15
-    
15
+
16 16
     expect_equal(ncol(colData(sce)) + 1, ncol(colData(mergedsce)))
17 17
 })
18 18
 
... ...
@@ -1,7 +1,7 @@
1 1
 context("misc functions")
2 2
 
3 3
 data(scExample, package = "singleCellTK")
4
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
4
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
5 5
 
6 6
 test_that("summarizeSCE", {
7 7
   ta <- summarizeSCE(sce, sample = NULL)
... ...
@@ -3,7 +3,7 @@ library(singleCellTK)
3 3
 context("Testing dimensionality reduction algorithms")
4 4
 data(scExample, package = "singleCellTK")
5 5
 sceDroplet <- sce
6
-sce <- sce[, colData(sce)$type != 'EmptyDroplet']
6
+sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
7 7
 sampleVector <- c(rep("Sample1", 100), rep("Sample2", 95))
8 8
 sceres <- getUMAP(inSCE = sce, useAssay = "counts", logNorm = TRUE, sample = sampleVector, nNeighbors = 10, reducedDimName = "UMAP",
9 9
                 nIterations = 20, alpha = 1, minDist = 0.01, pca = TRUE, initialDims = 20)