Browse code

Merge pull request #31 from compbiomed/devel

Devel

mingl1997 authored on 28/10/2024 16:00:42 • GitHub committed on 28/10/2024 16:00:42
Showing 4 changed files

... ...
@@ -271,6 +271,7 @@ import(GSVAdata)
271 271
 import(SingleCellExperiment)
272 272
 import(eds)
273 273
 importFrom(BiocParallel,SerialParam)
274
+importFrom(ComplexHeatmap,anno_barplot)
274 275
 importFrom(S4Vectors,"metadata<-")
275 276
 importFrom(S4Vectors,metadata)
276 277
 importFrom(SingleCellExperiment,"counts<-")
... ...
@@ -317,8 +318,11 @@ importFrom(stats,prcomp)
317 318
 importFrom(stats,quantile)
318 319
 importFrom(stringr,str_c)
319 320
 importFrom(stringr,str_replace_all)
321
+importFrom(tibble,column_to_rownames)
322
+importFrom(tibble,remove_rownames)
320 323
 importFrom(tibble,tibble)
321 324
 importFrom(tidyr,spread)
325
+importFrom(tidyr,unite)
322 326
 importFrom(tools,file_ext)
323 327
 importFrom(utils,head)
324 328
 importFrom(utils,packageVersion)
... ...
@@ -25,6 +25,11 @@
25 25
 #' @param cellIndexBy A single character specifying a column name of
26 26
 #' \code{colData(inSCE)}, or a vector of the same length as \code{ncol(inSCE)},
27 27
 #' where we search for the non-rowname cell indices. Default \code{"rownames"}.
28
+#' @param cluster_columns A logical scalar that turns on/off 
29
+#' clustering of columns. Default \code{FALSE}. Column clustering should be turned off when plotting from a reduced dim,
30
+#' because the columns will then be sorted by PCs.
31
+#' @param cluster_rows A logical scalar that turns on/off clustering of rows. 
32
+#' Default \code{FALSE}.
28 33
 #' @param rowDataName character. The column name(s) in \code{rowData} that need
29 34
 #' to be added to the annotation. Not applicable for
30 35
 #' \code{plotSCEDimReduceHeatmap}. Default \code{NULL}.
... ...
@@ -103,7 +108,8 @@
103 108
 #' @importFrom stringr str_replace_all str_c
104 109
 #' @importFrom stats prcomp quantile
105 110
 #' @importFrom dplyr select arrange group_by count ungroup mutate one_of desc
106
-#' @importFrom tidyr spread unite column_to_rownames remove_rownames
111
+#' @importFrom tidyr spread unite 
112
+#' @importFrom tibble column_to_rownames remove_rownames
107 113
 #' @importFrom grid gpar
108 114
 #' @importFrom ComplexHeatmap anno_barplot
109 115
 #' @importFrom rlang .data
... ...
@@ -113,6 +119,8 @@ plotSCEHeatmap <- function(inSCE, useAssay = 'logcounts', useReducedDim = NULL,
113 119
                            scale = TRUE, trim = c(-2,2),
114 120
                            featureIndexBy = 'rownames',
115 121
                            cellIndexBy = 'rownames',
122
+                           cluster_columns = FALSE,
123
+                           cluster_rows = FALSE,
116 124
                            rowDataName = NULL, colDataName = NULL,
117 125
                            aggregateRow = NULL, aggregateCol = NULL,
118 126
                            featureAnnotations = NULL, cellAnnotations = NULL,
... ...
@@ -282,8 +290,8 @@ plotSCEHeatmap <- function(inSCE, useAssay = 'logcounts', useReducedDim = NULL,
282 290
     temp_df<-as.data.frame(colData(SCE)[,c(aggregateCol),drop=FALSE]) %>% 
283 291
       unite("new_colnames",1:ncol(.),sep = "_",remove = FALSE) %>% 
284 292
       remove_rownames() %>% 
285
-      mutate(aggregated_column = new_colnames) %>%
286
-      dplyr::select(new_colnames, aggregated_column) %>%
293
+    #  mutate(aggregated_column = new_colnames) %>%
294
+    #  dplyr::select(new_colnames, aggregated_column) %>%
287 295
       column_to_rownames("new_colnames")
288 296
 
289 297
     colData(SCE)<-DataFrame(temp_df)
... ...
@@ -446,7 +454,8 @@ plotSCEHeatmap <- function(inSCE, useAssay = 'logcounts', useReducedDim = NULL,
446 454
                                 show_row_dend = rowDend,
447 455
                                 show_column_dend = colDend,
448 456
                                 row_dend_reorder = TRUE,
449
-                                cluster_columns = FALSE,
457
+                                cluster_columns = cluster_columns,
458
+                                cluster_rows = cluster_rows,
450 459
                                 show_column_names = colLabel,
451 460
                                 column_names_gp = grid::gpar(fontsize = colLabelSize),
452 461
                                 row_gap = rowGap, column_gap = colGap,
... ...
@@ -15,6 +15,8 @@ plotSCEHeatmap(
15 15
   trim = c(-2, 2),
16 16
   featureIndexBy = "rownames",
17 17
   cellIndexBy = "rownames",
18
+  cluster_columns = FALSE,
19
+  cluster_rows = FALSE,
18 20
   rowDataName = NULL,
19 21
   colDataName = NULL,
20 22
   aggregateRow = NULL,
... ...
@@ -65,8 +67,8 @@ another feature list indicated by \code{featureIndexBy}. Default \code{NULL}.}
65 67
 (cells). Alternatively, it can be a vector identifying cells in another
66 68
 cell list indicated by \code{cellIndexBy}. Default \code{NULL}.}
67 69
 
68
-\item{scale}{Whether to perform z-score scaling on each row. Default
69
-\code{TRUE}.}
70
+\item{scale}{Whether to perform z-score or min-max scaling on each row. Choose from \code{"zscore"}, \code{"min-max"}, \code{TRUE} or \code{FALSE}. Default
71
+\code{TRUE}.}
70 72
 
71 73
 \item{trim}{A 2-element numeric vector. Values outside of this range will be
72 74
 trimmed to their nearest bound. Default \code{c(-2, 2)}}
... ...
@@ -80,6 +82,13 @@ where we search for the non-rowname feature indices. Not applicable for
80 82
 \code{colData(inSCE)}, or a vector of the same length as \code{ncol(inSCE)},
81 83
 where we search for the non-rowname cell indices. Default \code{"rownames"}.}
82 84
 
85
+\item{cluster_columns}{A logical scalar that turns on/off 
86
+clustering of columns. Default \code{FALSE}. Column clustering should be turned off when plotting from a reduced dim,
87
+because the columns will then be sorted by PCs.}
88
+
89
+\item{cluster_rows}{A logical scalar that turns on/off clustering of rows. 
90
+Default \code{FALSE}.}
91
+
83 92
 \item{rowDataName}{character. The column name(s) in \code{rowData} that need
84 93
 to be added to the annotation. Not applicable for
85 94
 \code{plotSCEDimReduceHeatmap}. Default \code{NULL}.}
... ...
@@ -207,34 +207,56 @@ Other heatmap settings will also be automatically filled for a DE specific heatm
207 207
 <div id="console" class="tabcontent">
208 208
 ````
209 209
 
210
-To present the usage of `plotSCEHeatmap()`, we would like to use a small example provided with SCTK.  
210
+To present the usage of `plotSCEHeatmap()`, we would like to use a small example provided with SCTK. 
211
+
212
+**"Raw" plotting**
213
+
214
+The minimum setting for `plotSCEHeatmap()` is the input SCE object and the data matrix to plot (default `"logcounts"`). In this way, all cells and features will be presented while no annotation or legend (except the main color scheme) will be shown.  
211 215
 
212 216
 ```{R setup, eval=TRUE, message=FALSE, cache=TRUE}
213 217
 library(singleCellTK)
214 218
 data("scExample") # This imports SCE object "sce"
215 219
 sce
216
-```
217 220
 
218
-**"Raw" plotting**
221
+# QC - Remove empty droplets
222
+sce2<-subsetSCECols(sce, colData = c("type != 'EmptyDroplet'"))
219 223
 
220
-The minimum setting for `plotSCEHeatmap()` is the input SCE object and the data matrix to plot (default `"logcounts"`). In this way, all cells and features will be presented while no annotation or legend (except the main color scheme) will be shown.  
224
+# Normalize the counts 
225
+sce2<-runNormalization(sce2, useAssay = "counts", outAssayName = "logcounts",
226
+                        normalizationMethod = "logNormCounts",scale = TRUE)
221 227
 
222
-```{R hmFull, eval=TRUE, cache=TRUE}
223
-plotSCEHeatmap(sce, useAssay = "counts")
228
+# plot the data
229
+plotSCEHeatmap(sce2,useAssay = "logcounts",cluster_rows = TRUE, cluster_columns = TRUE)
224 230
 ```
225 231
 
226 232
 **Subsetting**
227 233
 
228 234
 SCTK allows relatively flexible approaches to select the cells/features to plot.  
229 235
 
230
-The basic way to subset the heatmap is to directly use an index vector that can subset the input SCE object to `featureIndex` and `cellIndex`, including `numeric`, and `logical` vectors, which are widely used, and `character` vector containing the row/col names. Of course, user can directly use a subsetted SCE object as input.  
236
+The basic way to subset the heatmap is to directly pass an index vector that can subset the input SCE object to `featureIndex` and `cellIndex`. This includes the widely used `numeric` and `logical` vectors, as well as `character` vectors containing the row/col names. Of course, users can also directly use a subsetted SCE object as input.  First, let's run a simple clustering workflow to identify clusters and find DE genes for each cluster. We can then subset the heatmap using this list of DE genes.
237
+
238
+```{R idxSubset, eval=TRUE, cache=TRUE, message=FALSE, warning=FALSE, echo=FALSE}
239
+
240
+# Run Clustering workflow
241
+set.seed(348389)
242
+sce2 <- runFeatureSelection(sce2, useAssay = "counts")
243
+sce2 <- setTopHVG(sce2, featureSubsetName = "hvf")
244
+sce2 <- runDimReduce(sce2, useAssay = "logcounts", useFeatureSubset = "hvf", scale = TRUE, reducedDimName = "PCA")
245
+sce2 <- runDimReduce(sce2, method = "scaterUMAP", useReducedDim = "PCA", reducedDimName = "UMAP", nComponents = 10)
246
+sce2 <- runScranSNN(inSCE = sce2, useReducedDim = "PCA", nComp = 10, clusterName = "scranSNN_PCA")
231 247
 
232
-```{R idxSubset, eval=TRUE, cache=TRUE}
233
-# Make up random downsampling numeric vector
234
-featureSubset <- sample(nrow(sce), 50)
235
-cellSubset <- sample(ncol(sce), 50)
248
+# set gene ID as rownames
249
+sce2<-setRowNames(sce2,"feature_name")
236 250
 
237
-plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, cellIndex = cellSubset)
251
+
252
+# Find markers for each cluster
253
+sce2 <- runFindMarker(sce2, useAssay = "logcounts", method = "wilcox", cluster = "scranSNN_PCA")
254
+topMarkers <- getFindMarkerTopTable(sce2, topN = 5, log2fcThreshold = 0.5, 
255
+                                    fdrThreshold = 0.05, minClustExprPerc = 0.5, 
256
+                                    maxCtrlExprPerc = 0.5, minMeanExpr = 0)
257
+
258
+# Using feature index to select for genes in topMarkers list 
259
+plotSCEHeatmap(sce2,useAssay = "logcounts",rowLabel = TRUE,featureIndex = topMarkers$Gene,cluster_columns = TRUE)
238 260
 ```
239 261
 
240 262
 ````{=html}
... ...
@@ -246,9 +268,11 @@ plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, c
246 268
 In a more complex situation, where users might only have a set of identifiers which are not inside the row/col names (i.e. unable to directly subset the SCE object), we provide another approach. The subset, in this situation, can be accessed via specifying a vector that contains the identifiers users have, to `featureIndexBy` or `cellIndexBy`. This specification allows directly giving one column name of `rowData` or `colData`.  
247 269
 
248 270
 ```{R indexBy, eval=TRUE, cache=TRUE}
249
-subsetFeatureName <- sample(rowData(sce)$feature_name, 50)
250
-subsetCellBarcode <- sample(sce$cell_barcode, 50)
251
-plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = subsetFeatureName, featureIndexBy = "feature_name", cellIndex = subsetCellBarcode, cellIndexBy = "cell_barcode")
271
+
272
+list_of_FIDs<-c("ENSG00000251562","ENSG00000205542","ENSG00000177954","ENSG00000166710")
273
+
274
+plotSCEHeatmap(inSCE = sce2, useAssay = "logcounts", featureIndexBy = "feature_ID",  featureIndex = list_of_FIDs, cluster_rows = TRUE, cluster_columns = TRUE, rowLabel = TRUE)
275
+
252 276
 ```
253 277
 
254 278
 ````{=html}
... ...
@@ -260,12 +284,8 @@ plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = subsetFeatureNam
260 284
 As introduced before, we allow directly using column names of `rowData` or `colData` to attach color bar annotations. To make use of this functionality, pass a `character` vector to `rowDataName` or `colDataName`. 
261 285
 
262 286
 ```{R colRowAnn, eval=TRUE, cache=TRUE}
263
-# Make up arbitrary annotation, 
264
-rowRandLabel <- c(rep('aa', 100), rep('bb', 100))
265
-rowData(sce)$randLabel <- rowRandLabel
266
-colRandLabel <- c(rep('cc', 195), rep('dd', 195))
267
-colData(sce)$randLabel <- colRandLabel
268
-plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, cellIndex = cellSubset, rowDataName = "randLabel", colDataName = c("type", "randLabel"))
287
+# Create new annotation for markers 
288
+plotSCEHeatmap(inSCE = sce2, useAssay = "logcounts", featureIndex = topMarkers$Gene, colDataName = c( "scranSNN_PCA"),rowLabel = TRUE, cluster_rows = TRUE, cluster_columns = TRUE)
269 289
 ```
270 290
 
271 291
 ````{=html}
... ...
@@ -273,12 +293,12 @@ plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, c
273 293
   <summary><b>Customized Annotation</b></summary>
274 294
 ```` 
275 295
 
276
-Fully customized annotation is also supported, though it can be complexed for users. For the labeling, it is more recommanded to insert the information into `rowData` or `colData` and then make use. For coloring, information should be passed to `featureAnnotationColor` or `cellAnnotationColor`. The argument must be a `list` object with names matching the annotation classes (such as `"randLabel"` and `"type"`); each inner object under a name must be a named vector, with colors as the values and existing categories as the names. The working instance looks like this:
296
+Fully customized annotation is also supported, though it can be complex for users. For labeling, we recommend inserting the information into `rowData` or `colData` and using it from there. For coloring, information should be passed to `featureAnnotationColor` or `cellAnnotationColor`. The argument must be a `list` object with names matching the annotation classes (such as `"randLabel"` and `"type"`); each inner object under a name must be a named vector, with colors as the values and existing categories as the names. The working instance looks like this:
277 297
 
278 298
 ```{R colorEG, eval=FALSE, echo=FALSE}
279 299
 colAnnotattionColor <- list(
280 300
   sample = c(pbmc_4k = "FF4D4D"),
281
-  type = c(Singlet = "#4DFFFF", Doublet = "#FFC04D", EmptyDroplet = "#4D4DFF")
301
+  type = c(Singlet = "#4DFFFF", Doublet = "#FFC04D")
282 302
 )
283 303
 ```
284 304
 
... ...
@@ -291,7 +311,27 @@ colAnnotattionColor <- list(
291 311
 **1. Grouping/Splitting** In some cases, it might be better to do a "semi-heatmap" (i.e. split the rows/columns first and cluster them within each group) to visualize some expression pattern, such as evaluating the differential expression. For this need, use `rowSplitBy` or `colSplitBy`, and the arguments must be a `character` vector that is a subset of the specified annotation.  
292 312
 
293 313
 ```{R split, eval=TRUE, cache=TRUE}
294
-plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, cellIndex = cellSubset, rowDataName = "randLabel", colDataName = c("type", "randLabel"), rowSplitBy = "randLabel", colSplitBy = "type")
314
+
315
+# Create a new label in the rowData using the cluster markers
316
+
317
+data.frame(rowData(sce2)) %>% 
318
+  left_join(topMarkers, by = c("feature_name" = "Gene")) %>%
319
+  rename("cluster_markers" = "scranSNN_PCA") -> new_row_data
320
+
321
+rownames(new_row_data)<-new_row_data$feature_name
322
+
323
+rowData(sce2)<-new_row_data
324
+
325
+plotSCEHeatmap(inSCE = sce2, useAssay = "logcounts", featureIndex = topMarkers$Gene, colDataName = c("type"), aggregateCol = "scranSNN_PCA", rowGap = grid::unit(2, 'mm'),rowLabel = TRUE, rowDataName = "cluster_markers", rowSplitBy = "cluster_markers")
326
+
327
+# Adding a summary 
328
+
329
+data.frame(colData(sce2)) %>% 
330
+  mutate(summary_col = sample(5,n(), replace = TRUE)) -> new_col_data
331
+
332
+colData(sce2)<-DataFrame(new_col_data)
333
+
334
+plotSCEHeatmap(inSCE = sce2, useAssay = "logcounts", featureIndex = topMarkers$Gene, colDataName = c("type"), aggregateCol = "scranSNN_PCA", rowGap = grid::unit(2, 'mm'),rowLabel = TRUE, rowDataName = "cluster_markers", rowSplitBy = "cluster_markers", addCellSummary = "summary_col" )
295 335
 ```
296 336
 
297 337
 **2. Cell/Feature Labeling** Text labels of features or cells can be added via `rowLabel` or `colLabel`. Use `TRUE` or `FALSE` to specify whether to show the `rownames` or `colnames` of the subsetted SCE object. Additionally, giving a single string of a column name of `rowData` or `colData` can enable the labeling of the annotation. Furthermore, users can directly pass a character vector to the parameter, with the same length as either the full SCE object or the subsetted one.  
... ...
@@ -301,7 +341,7 @@ plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, c
301 341
 **4. Row/Column titles** The row title (`"Genes"`) and column title (`"Cells"`) can be changed or removed by passing a string or `NULL` to `rowTitle` or `colTitle`, respectively.  
302 342
 
303 343
 ```{R label, eval=TRUE, cache=TRUE}
304
-plotSCEHeatmap(inSCE = sce, useAssay = "counts", featureIndex = featureSubset, cellIndex = cellSubset, rowLabel = "feature_name", colLabel = seq(ncol(sce)), colDend = FALSE, rowTitle = "Downsampled features")
344
+plotSCEHeatmap(inSCE = sce2, useAssay = "logcounts", featureIndex = topMarkers$Gene, rowGap = grid::unit(2, 'mm'),rowLabel = TRUE,  rowTitle = "Markers",colTitle = "Clusters", cluster_columns = TRUE, cluster_rows = TRUE)
305 345
 ```
306 346
 
307 347
 There are still some parameters not mentioned here, but they are not frequently used. Please refer to `?plotSCEHeatmap` as well as `?ComplexHeatmap::Heatmap`.