Former-commit-id: fa2385410e4cefdc694545daa48892d15ffe8921
... | ... |
@@ -170,7 +170,8 @@ setMethod("train_classifier", c("train_obj" = "Seurat"), |
170 | 170 |
clf$trainingData <- clf$resample <- clf$resampledCM <- NULL |
171 | 171 |
p_thres <- 0.5 |
172 | 172 |
|
173 |
- object <- SingleCellClassR(cell_type, clf, features, p_thres, NA_character_) |
|
173 |
+ object <- SingleCellClassR(cell_type, clf, labels(clf$terms), p_thres, |
|
174 |
+ NA_character_) |
|
174 | 175 |
|
175 | 176 |
# only assign parent if pretrained model for parent cell type is avai |
176 | 177 |
if ((!is.null(parent_process$parent.clf) |
... | ... |
@@ -266,7 +267,7 @@ setMethod("train_classifier", c("train_obj" = "SingleCellExperiment"), |
266 | 267 |
clf <- train_func(balance_ds$mat, balance_ds$tag) |
267 | 268 |
|
268 | 269 |
p_thres <- 0.5 |
269 |
- object <- SingleCellClassR(cell_type, clf, features, p_thres, |
|
270 |
+ object <- SingleCellClassR(cell_type, clf, labels(clf$terms), p_thres, |
|
270 | 271 |
NA_character_) |
271 | 272 |
|
272 | 273 |
# only assign parent if pretrained model for parent cell type is avai |
... | ... |
@@ -665,7 +666,7 @@ setMethod("classify_cells", c("classify_obj" = "Seurat"), |
665 | 666 |
|
666 | 667 |
if (any(pred_cells != "")) { |
667 | 668 |
pred_cells <- gsub("/$", "", pred_cells) |
668 |
- |
|
669 |
+ |
|
669 | 670 |
# ignore ambiguous results |
670 | 671 |
if (ignore_ambiguous_result == TRUE) { |
671 | 672 |
pred_cells <- unlist(lapply(pred_cells, |
... | ... |
@@ -728,12 +729,16 @@ setMethod("classify_cells", c("classify_obj" = "SingleCellExperiment"), |
728 | 729 |
prediction <- make_prediction(filtered_mat, classifier, pred_cells, ignore_ambiguous_result) |
729 | 730 |
pred <- prediction$pred |
730 | 731 |
pred_cells <- prediction$pred_cells |
731 |
- |
|
732 | 732 |
# add prediction to meta data: 2 cols: p, class |
733 | 733 |
for (colname in colnames(pred)) { |
734 |
- classify_obj[[colname]] <- pred[, colname, drop = FALSE] |
|
734 |
+ classify_obj[[colname]] <- unlist(lapply(colnames(classify_obj), |
|
735 |
+ function(x) |
|
736 |
+ if (x %in% rownames(pred)) {pred[x, colname]} |
|
737 |
+ else {NA})) |
|
738 |
+ #pred[, colname, drop = FALSE, ] |
|
735 | 739 |
} |
736 | 740 |
} |
741 |
+ |
|
737 | 742 |
if (any(pred_cells != "")) { |
738 | 743 |
pred_cells <- gsub("/$", "", pred_cells) |
739 | 744 |
|
... | ... |
@@ -26,7 +26,7 @@ |
26 | 26 |
#' @description Pretrained classifier obtained by training and testing on |
27 | 27 |
#' The Tabula Muris Consortium. |
28 | 28 |
#' @docType data |
29 |
-#' @usage default_models_n |
|
29 |
+#' @usage default_models_m |
|
30 | 30 |
#' @format a list of \code{\link{SingleCellClassR}} objects |
31 | 31 |
#' @author Vy Nguyen, November 2020 |
32 | 32 |
#' @keywords datasets |
... | ... |
@@ -615,17 +615,16 @@ make_prediction <- function(mat, classifier, pred_cells, ignore_ambiguous_result |
615 | 615 |
if(x[1] >= classifier@p_thres) {"yes"} else {"no"})) |
616 | 616 |
rownames(pred) <- rownames(mat) |
617 | 617 |
|
618 |
- # append a sumary to whole predicted cell type |
|
618 |
+ # append a summary to whole predicted cell type |
|
619 | 619 |
pred_cells <- unlist(lapply(cells, |
620 | 620 |
function(i) |
621 | 621 |
if (i %in% rownames(pred) && pred[i, "class"] == "yes") { |
622 | 622 |
if (ignore_ambiguous_result == TRUE && !is.na(classifier@parent) && |
623 | 623 |
gsub("/", "", pred_cells[i]) == classifier@parent) |
624 |
- { classifier@cell_type } |
|
624 |
+ { paste0(classifier@cell_type, "/") } |
|
625 | 625 |
else { paste0(pred_cells[i], classifier@cell_type, "/") } |
626 | 626 |
} |
627 | 627 |
else { pred_cells[i] })) |
628 |
- |
|
629 | 628 |
names(pred_cells) <- cells |
630 | 629 |
|
631 | 630 |
# remove no column and rename yes column to p |
... | ... |
@@ -706,12 +705,13 @@ verify_parent <- function(mat, classifier, meta.data) { |
706 | 705 |
# parent clf, if avai, always has to be applied before children clf. |
707 | 706 |
parent_slot <- paste0(c(unlist(strsplit(classifier@parent, split = " ")), "class"), collapse = "_") |
708 | 707 |
if (parent_slot %in% colnames(meta.data)) { |
709 |
- parent_pred <- meta.data[, parent_slot, drop = FALSE] |
|
710 |
- pos_parent <- rownames(parent_pred[parent_pred == 'yes', , drop=FALSE]) |
|
708 |
+ parent_pred <- meta.data[, parent_slot] |
|
709 |
+ pos_parent <- colnames(mat)[parent_pred == 'yes'] |
|
711 | 710 |
} else { |
712 | 711 |
warning(paste0('Parent classifier of ', classifier@cell_type, 'cannot be applied.\n |
713 | 712 |
Please list/save parent classifier before child(ren) classifier.\n |
714 |
- Skip applying classification models for ', classifier@cell_type, ' and its parent cell type.\n'), call. = FALSE, immediate. = TRUE) |
|
713 |
+ Skip applying classification models for ', classifier@cell_type, |
|
714 |
+ ' and its parent cell type.\n'), call. = FALSE, immediate. = TRUE) |
|
715 | 715 |
} |
716 | 716 |
|
717 | 717 |
if (!is.null(pos_parent)) { |
... | ... |
@@ -29,41 +29,67 @@ any other cell type. |
29 | 29 |
|
30 | 30 |
## Preparing train object and test object |
31 | 31 |
|
32 |
-The workflow starts from a couple of Seurat objects where cells have been |
|
32 |
+The workflow starts from a couple of objects where cells have been |
|
33 | 33 |
assigned to be different cell types. To do this, users may have annotated |
34 |
-scRNA-seq data (by a FACS-sorting process, for example), create a Seurat |
|
35 |
-object based on the sequencing data and assign the predetermined cell types |
|
36 |
-as Seurat meta data. If the scRNA-seq data has not been annotated yet, |
|
37 |
-another possible approach is to follow the basic Seurat workflow until |
|
38 |
-assigning cell type identity to clusters. |
|
34 |
+scRNA-seq data (by a FACS-sorting process, for example), create a Seurat/ |
|
35 |
+SingleCellExperiment (SCE) object based on the sequencing data and assign the |
|
36 |
+predetermined cell types as cell meta data. If the scRNA-seq data has not |
|
37 |
+been annotated yet, another possible approach is to follow the basic |
|
38 |
+workflow (Seurat, for example) until assigning cell type identity to clusters. |
|
39 |
+ |
|
40 |
+In this vignette, we use the human lung dataset from Zilionis et al., 2019, |
|
41 |
+which is available in the scRNAseq (2.4.0) library. The dataset is stored as a |
|
42 |
+SCE object. |
|
39 | 43 |
|
40 | 44 |
To start the training workflow, we first load the neccessary libraries. |
41 | 45 |
```{r} |
42 | 46 |
library(SingleCellClassR) |
43 |
-library(SingleCellClassR.data) |
|
47 |
+library(scRNAseq) |
|
44 | 48 |
``` |
45 | 49 |
|
46 |
-One Seurat object will be used as train object, while the other is the test |
|
47 |
-object. In this example, we used Sade-Feldman dataset to create the train |
|
48 |
-object. |
|
50 |
+Load the dataset: |
|
49 | 51 |
|
50 | 52 |
```{r} |
51 |
-data("feldman_seurat") |
|
52 |
-feldman_seurat |
|
53 |
+zilionis <- ZilionisLungData() |
|
53 | 54 |
``` |
54 | 55 |
|
55 |
-We load Jerby-Arnon dataset for the testing object. |
|
56 |
+We cut this dataset into two parts, one for the training and the other for the testing. |
|
56 | 57 |
```{r} |
57 |
-data("jerby_seurat") |
|
58 |
-jerby_seurat |
|
58 |
+pivot = ncol(zilionis)%/%2 |
|
59 |
+train_set <- zilionis[, 1:pivot] |
|
60 |
+test_set <- zilionis[, (1+pivot):ncol(zilionis)] |
|
59 | 61 |
``` |
60 |
-In our example, the cell type meta data is indicated as the active |
|
61 |
-identification of the Seurat object (in both train object and test |
|
62 |
-object). If cell type is stored in another slot of object meta data, |
|
62 |
+ |
|
63 |
+In this dataset, the cell type meta data is stored in the *Most likely LM22 cell type* |
|
64 |
+slot of the SingleCellExperiment object (in both train object and test object). |
|
65 |
+If cell type is stored in another slot of object meta data, |
|
63 | 66 |
the slot/tag slot name must be then provided as a parameter in the |
64 | 67 |
train and test method. |
65 | 68 |
```{r} |
66 |
-head(Idents(feldman_seurat)) |
|
69 |
+unique(train_set$`Most likely LM22 cell type`) |
|
70 |
+``` |
|
71 |
+```{r} |
|
72 |
+unique(test_set$`Most likely LM22 cell type`) |
|
73 |
+``` |
|
74 |
+ |
|
75 |
+We want to train a classifier for B cells and their phenotypes. Considering memory B cells, |
|
76 |
+naive B cells and plasma cells as B cell phenotypes, we convert all those cells to a uniform |
|
77 |
+cell label, ie. B cells. All non B cells are converted into 'others'. |
|
78 |
+ |
|
79 |
+```{r} |
|
80 |
+# change cell label |
|
81 |
+train_set$B_cell <- unlist(lapply(train_set$`Most likely LM22 cell type`, |
|
82 |
+ function(x) if (is.na(x)) {'ambiguous'} else if (x %in% c('Plasma cells', 'B cells memory', 'B cells naive')) {'B cells'} else {'others'})) |
|
83 |
+ |
|
84 |
+test_set$B_cell <- unlist(lapply(test_set$`Most likely LM22 cell type`, |
|
85 |
+ function(x) if (is.na(x)) {'ambiguous'} else if (x %in% c('Plasma cells', 'B cells memory', 'B cells naive')) {'B cells'} else {'others'})) |
|
86 |
+``` |
|
87 |
+ |
|
88 |
+We observe that there are cells marked NAs. Those can be understood as 1/different from all indicated cell types or 2/any unknown cell types. Here we consider the second case, ie. we don't know whether they are positive or negative to B cells. To avoid the affect of those NAs cells, we can assign them as 'ambiguous'. All cells tagged 'ambiguous' will be ignored by SingleCellClassR from training and testing. |
|
89 |
+ |
|
90 |
+We may want to check the number of cells in each category: |
|
91 |
+```{r} |
|
92 |
+table(train_set$B_cell) |
|
67 | 93 |
``` |
68 | 94 |
|
69 | 95 |
## Defining set of features |
... | ... |
@@ -83,13 +109,13 @@ selected_features_B <- c("CD19", "MS4A1", "SDC1", "CD79A", "CD79B", |
83 | 109 |
## Train model |
84 | 110 |
|
85 | 111 |
When the model is being trained, three most important information must be |
86 |
-provided are: the Seurat object used for training, the set of applied features |
|
112 |
+provided are: the Seurat/SCE object used for training, the set of applied features |
|
87 | 113 |
and the cell type defining the trained model. |
88 | 114 |
|
89 | 115 |
Cell type corresponding to the trained model must exist among identities |
90 | 116 |
assigned to cells in the trained Seurat object. Remember if cell types |
91 | 117 |
are not indicated as active identification of the trained object, name |
92 |
-of the tag slot in object meta data must be provided to the tag_slot parameter. |
|
118 |
+of the tag slot in object meta data must be provided to the sce_tag_slot parameter. |
|
93 | 119 |
|
94 | 120 |
When training on a imbalanced dataset, the trained model may bias toward the |
95 | 121 |
majority group and ignore the presence of the minority group. To avoid this, |
... | ... |
@@ -99,8 +125,8 @@ from the majority group. To use the same set of cells while training multiple |
99 | 125 |
times for one model, users can use set.seed. |
100 | 126 |
```{r} |
101 | 127 |
set.seed(123) |
102 |
-clf_B <- train_classifier(train_obj = feldman_seurat, |
|
103 |
-features = selected_features_B, cell_type = "B cells") |
|
128 |
+clf_B <- train_classifier(train_obj = train_set, cell_type = "B cells", features = selected_features_B, |
|
129 |
+ sce_assay = 'counts', sce_tag_slot = 'B_cell') |
|
104 | 130 |
``` |
105 | 131 |
```{r} |
106 | 132 |
clf_B |
... | ... |
@@ -115,7 +141,8 @@ clf(clf_B) |
115 | 141 |
## Test model |
116 | 142 |
|
117 | 143 |
```{r} |
118 |
-clf_B_test <- test_classifier(test_obj = jerby_seurat, classifier = clf_B) |
|
144 |
+clf_B_test <- test_classifier(test_obj = test_set, classifier = clf_B, |
|
145 |
+ sce_assay = 'counts', sce_tag_slot = 'B_cell') |
|
119 | 146 |
``` |
120 | 147 |
|
121 | 148 |
### Interpreting test model result |
... | ... |
@@ -138,7 +165,7 @@ With the same classification model, the sensitivity and the specification of cla |
138 | 165 |
clf_B_test$overall_roc |
139 | 166 |
``` |
140 | 167 |
|
141 |
-In this example of B cell classifier, the current threshold is at 0.5. The sensitivity is 0.9932203, and the specificity is 0.9890493 (FPR = 0.010950643). The higher sensitivity (0.9943503) can be reached if we set the p_thres at 0.4. However, we will have lower specificity (FPR = 0.013966037), which means that we misclassify more stranger cells as B cells. In contradiction, we may not retrieve all actual B cells with higher p_thres (0.6, for example). |
|
168 |
+In this example of B cell classifier, the current threshold is at 0.5. The higher sensitivity can be reached if we set the p_thres at 0.4. However, we will then have lower specificity, which means that we misclassify more stranger cells as B cells. In contradiction, we may not retrieve all actual B cells with higher p_thres (0.6, for example). |
|
142 | 169 |
|
143 | 170 |
There is of course a certain trade-off between the sensitivity and the specificity of the model. Depending on the need of the project or the user-own preference, a probability threshold giving higher sensitivity or higher specificity can be chosen. In our perspective, p_thres at 0.5 is a good choice for the current B cell model. |
144 | 171 |
|
... | ... |
@@ -50,33 +50,74 @@ clf_B <- default_models[['B cells']] |
50 | 50 |
clf_B |
51 | 51 |
``` |
52 | 52 |
## Preparing train object and test object |
53 |
+ |
|
53 | 54 |
Same as training for basic models, training for a child model also requires |
54 |
-a train (Seurat) object and a test (Seurat) object. All Seurat objects must |
|
55 |
+a train (Seurat/SCE) object and a test (Seurat/SCE) object. All objects must |
|
55 | 56 |
have a slot in meta data indicating the type of cells. Tag slot indicating |
56 | 57 |
parent cell type can also be provided. In this case, parent cell type tag |
57 | 58 |
will further be tested for coherence with the provided parent classifier. |
59 |
+ |
|
60 |
+Cell tagged as child cell type but incoherent to parent cell type will be |
|
61 |
+removed from training and testing for the child cell type classifier. |
|
62 |
+ |
|
63 |
+In this vignette, we use the human lung dataset from Zilionis et al., 2019, |
|
64 |
+which is available in the scRNAseq (2.4.0) library. The dataset is stored as a |
|
65 |
+SCE object. |
|
66 |
+ |
|
67 |
+To start the training workflow, we first load the neccessary libraries. |
|
58 | 68 |
```{r} |
59 |
-library(SingleCellClassR.data) |
|
60 |
-data("feldman_seurat") |
|
69 |
+library(scRNAseq) |
|
61 | 70 |
``` |
71 |
+ |
|
72 |
+Load the dataset: |
|
73 |
+ |
|
62 | 74 |
```{r} |
63 |
-# view train data |
|
64 |
-feldman_seurat |
|
75 |
+zilionis <- ZilionisLungData() |
|
65 | 76 |
``` |
77 |
+ |
|
78 |
+We cut this dataset into two parts, one for the training and the other for the testing. |
|
66 | 79 |
```{r} |
67 |
-# tag slot indicating parent cell type, B cells |
|
68 |
-head(Idents(feldman_seurat)) |
|
80 |
+pivot = ncol(zilionis)%/%2 |
|
81 |
+train_set <- zilionis[, 1:pivot] |
|
82 |
+test_set <- zilionis[, (1+pivot):ncol(zilionis)] |
|
69 | 83 |
``` |
84 |
+ |
|
85 |
+In this dataset, the cell type meta data is stored in the *Most likely LM22 cell type* |
|
86 |
+slot of the SCE object (in both train object and test object). |
|
87 |
+If cell type is stored in another slot of object meta data, |
|
88 |
+the slot/tag slot name must be then provided as a parameter in the |
|
89 |
+train and test method. |
|
90 |
+ |
|
70 | 91 |
```{r} |
71 |
-# tag slot indicating cell subset, plasma cells |
|
72 |
-head(feldman_seurat[['Plasma_cells']]) |
|
92 |
+table(train_set$`Most likely LM22 cell type`) |
|
73 | 93 |
``` |
74 |
-A second object for testing: |
|
75 | 94 |
```{r} |
76 |
-data("jerby_seurat") |
|
95 |
+table(test_set$`Most likely LM22 cell type`) |
|
96 |
+``` |
|
97 |
+ |
|
98 |
+Unlike the example of the training basic model, we will remove all NAs cells in order to reduce computationalc complexity. |
|
77 | 99 |
|
78 |
-# view test data |
|
79 |
-jerby_seurat |
|
100 |
+```{r} |
|
101 |
+# remove NAs cells |
|
102 |
+train_set <- train_set[, !is.na(train_set$`Most likely LM22 cell type`)] |
|
103 |
+test_set <- test_set[, !is.na(test_set$`Most likely LM22 cell type`)] |
|
104 |
+``` |
|
105 |
+ |
|
106 |
+```{r} |
|
107 |
+# convert cell label: |
|
108 |
+# 1 - positive to plasma cells, |
|
109 |
+# 0 - negative to plasma cells |
|
110 |
+train_set$plasma <- unlist(lapply(train_set$`Most likely LM22 cell type`, |
|
111 |
+ function(x) if (x == 'Plasma cells') {1} else {0})) |
|
112 |
+ |
|
113 |
+test_set$plasma <- unlist(lapply(test_set$`Most likely LM22 cell type`, |
|
114 |
+ function(x) if (x == 'Plasma cells') {1} else {0})) |
|
115 |
+``` |
|
116 |
+ |
|
117 |
+We may want to check the number of cells in each category: |
|
118 |
+```{r} |
|
119 |
+table(train_set$plasma) |
|
120 |
+# 1: plasma cells, 0: not plasma cells |
|
80 | 121 |
``` |
81 | 122 |
|
82 | 123 |
## Defining set of features |
... | ... |
@@ -109,17 +150,17 @@ path.to.models = '.'* |
109 | 150 |
Train the child classifier: |
110 | 151 |
```{r} |
111 | 152 |
set.seed(123) |
112 |
-clf_plasma <- train_classifier(train_obj = feldman_seurat, |
|
153 |
+clf_plasma <- train_classifier(train_obj = train_set, |
|
113 | 154 |
features = selected_features_plasma, cell_type = "Plasma cells", |
114 |
-seurat_tag_slot = 'Plasma_cells', parent_clf = clf_B) |
|
155 |
+sce_assay = 'counts', sce_tag_slot = 'plasma', parent_clf = clf_B) |
|
115 | 156 |
``` |
116 | 157 |
If B cells classifier has not been loaded to current working space, |
117 | 158 |
an equivalent training process should be: |
118 | 159 |
```{r} |
119 | 160 |
set.seed(123) |
120 |
-clf_plasma <- train_classifier(train_obj = feldman_seurat, |
|
161 |
+clf_plasma <- train_classifier(train_obj = train_set, |
|
121 | 162 |
features = selected_features_plasma, cell_type = "Plasma cells", |
122 |
-seurat_tag_slot = 'Plasma_cells', parent_cell = 'B cells') |
|
163 |
+sce_assay = 'counts', sce_tag_slot = 'plasma', parent_cell = 'B cells') |
|
123 | 164 |
``` |
124 | 165 |
```{r} |
125 | 166 |
clf_plasma |
... | ... |
@@ -132,35 +173,17 @@ clf(clf_plasma) |
132 | 173 |
|
133 | 174 |
Parent classifier must be also indicated in test method. |
134 | 175 |
```{r} |
135 |
-clf_plasma_test <- test_classifier(test_obj = jerby_seurat, |
|
136 |
-classifier = clf_plasma, seurat_tag_slot = 'Plasma_cells', parent_clf = clf_B) |
|
176 |
+clf_plasma_test <- test_classifier(test_obj = test_set, |
|
177 |
+classifier = clf_plasma, sce_assay = 'counts', sce_tag_slot = 'plasma', |
|
178 |
+parent_clf = clf_B) |
|
137 | 179 |
``` |
138 | 180 |
|
139 | 181 |
### Interpreting test model result |
182 |
+The test result obtained from a child model can be interpreted in the same way |
|
183 |
+as we do with the model for basic cell types. We can change the prediction |
|
184 |
+probability threshold according to the research project or personal preference |
|
185 |
+and plot a roc curve. |
|
140 | 186 |
|
141 |
-Same as testing a basic classification models, testing a child classifier also returns an object, which is a list of: *test_tag*, *pred*, *acc*, *auc*, *overall_roc* |
|
142 |
- |
|
143 |
-The *overall_roc* is a summary of True Positive Rate (sensitivity) and False Positive Rate (1 - specificity) obtained by the trained model according to different thresholds: |
|
144 |
- |
|
145 |
-```{r} |
|
146 |
-clf_plasma_test$overall_roc |
|
147 |
-``` |
|
148 |
- |
|
149 |
-We see that with the same TPR (*= 1.0*), the FPR increases if *p_thres* increases to *0.7*. Therefore, this is the better result as compare to the result produced by *p_thres = 0.5*. We change the prediction probability threshold as: |
|
150 |
- |
|
151 |
-```{r} |
|
152 |
-p_thres(clf_plasma) <- 0.7 |
|
153 |
-``` |
|
154 |
- |
|
155 |
-Hence the prediction result was changed as: |
|
156 |
-```{r} |
|
157 |
-clf_plasma_test <- test_classifier(test_obj = jerby_seurat, |
|
158 |
-classifier = clf_plasma, seurat_tag_slot = 'Plasma_cells', parent_clf = clf_B) |
|
159 |
-``` |
|
160 |
- |
|
161 |
-Now the specificity increased from *0.963759909399774* to *0.994337485843715* and with the same number of actual plasma cells (*65*), the number of predicted cells decreased from *97* to *70*. |
|
162 |
- |
|
163 |
-The ROC curve remains the same with the same AUC score: |
|
164 | 187 |
```{r} |
165 | 188 |
print(clf_plasma_test$auc) |
166 | 189 |
roc_curve <- plot_roc_curve(test_result = clf_plasma_test) |
... | ... |
@@ -172,11 +195,13 @@ plot(roc_curve) |
172 | 195 |
In order to save child classifier, parent classifier must have existed in the |
173 | 196 |
classifier database, either in the package default database or in user-defined |
174 | 197 |
database. |
198 |
+ |
|
175 | 199 |
```{r} |
176 | 200 |
# see list of available model in package |
177 | 201 |
data("default_models") |
178 | 202 |
names(default_models) |
179 | 203 |
``` |
204 |
+ |
|
180 | 205 |
In our package, default models include already models classifying plasma cells. |
181 | 206 |
Therefore, we will save this model to a new local database specified by the |
182 | 207 |
*path.to.models* parameter. If you start with a fresh new local database, |
... | ... |
@@ -195,61 +220,39 @@ save_new_model(new_model = clf_plasma, path.to.models = getwd(), |
195 | 220 |
|
196 | 221 |
When we save the B cells' classifier and the plasma cells' classifier, a local database is newly created. We can use this new database to classify cells in a Seurat or SingleCellExperiment object. |
197 | 222 |
|
198 |
-Let's try to classify Jerby-Arnon dataset: |
|
223 |
+Let's try to classify cells in the test set: |
|
199 | 224 |
```{r} |
200 |
-classified_jerby <- classify_cells(classify_obj = jerby_seurat, |
|
201 |
- cell_types = 'all', path_to_models = getwd()) |
|
225 |
+classified <- classify_cells(classify_obj = test_set, sce_assay = 'counts', |
|
226 |
+ cell_types = 'all', path_to_models = getwd()) |
|
202 | 227 |
``` |
203 | 228 |
|
204 | 229 |
Using the *classify_cells()* function, we have to indicate exactly the repository containing the database that the models has recently been saved to. In the previous section, we saved our new models to the current working directory. |
205 | 230 |
|
206 |
-In the *classified_jerby* object, the classification process added new columns to the cell meta data, including the *predicted_cell_type* and *most_probable_cell_type* columns. Let's take a look at the original plasma cells tag: |
|
207 |
- |
|
208 |
-```{r} |
|
209 |
-# get a summary of the plasma cells tag |
|
210 |
-table(classified_jerby[['Plasma_cells']][,1]) |
|
211 |
-# 1 corresponds to cells positive to B cells and 0 corresponds to cells negative to B cells |
|
212 |
-``` |
|
213 |
-65 cells (labeled 1) are plasma cells and 7121 cells are not plasma cells. |
|
231 |
+In the *classified* object, the classification process added new columns to the cell meta data, including the *predicted_cell_type* and *most_probable_cell_type* columns. |
|
214 | 232 |
|
215 | 233 |
If we use the full prediction to compare with actual plasma tag, we obtain this result: |
216 | 234 |
```{r} |
217 | 235 |
# compare the prediction with actual cell tag |
218 |
-table(classified_jerby[['predicted_cell_type']][,1], classified_jerby[['Plasma_cells']][,1]) |
|
236 |
+table(classified$predicted_cell_type, classified$plasma) |
|
237 |
+# plasma cell is child cell type of B cell |
|
238 |
+# so of course, all predicted plasma cells are predicted B cells |
|
219 | 239 |
``` |
220 | 240 |
|
221 |
-We find that 65 actual plasma cells were assigned as B cells and as plasma cells at the same time. This is a reasonable result because in this example, we consider all plasma cells are B cells. |
|
222 |
- |
|
223 |
-However, comparing the actual tag with the most probable prediction, we obtain: |
|
241 |
+When comparing the actual tag with the most probable prediction, we obtain: |
|
224 | 242 |
```{r} |
225 | 243 |
# compare the prediction with actual cell tag |
226 |
-table(classified_jerby[['most_probable_cell_type']][,1], classified_jerby[['Plasma_cells']][,1]) |
|
244 |
+table(classified$most_probable_cell_type, classified$plasma) |
|
227 | 245 |
``` |
228 | 246 |
|
229 |
-This is a quite surprise result. Among the 65 actual plasma cells, only 1 was identified but 64 others were identified as B cells. In contradiction, the testing process of *clf_plasma* proved that this model could detect all plasma cells in the same dataset. Why this can happen? This contradictory can be explained. |
|
230 |
- |
|
231 |
-The *predicted_cell_type* takes all predictions having the probabilities satisfying the corresponding probability thresholds. In this case, SingleCellClassR takes all predicted B cells having probability from 0.5 and all predicted plasma cells having probability from 0.7. Meanwhile, the *most_probable_cell_type* takes only the cell type which gives highest prediction probability. Here, 64 predicted plasma cells were also predicted as B cells. However, they have a higher probability to be B cells than to be plasma cells. Therefore, they were classified as B cells in the *most_probable_cell_type*. |
|
247 |
+The number of identified plasma cells is different in the *predicted_cell_type* slot and in the *most_probable_cell_type*. This is because the *predicted_cell_type* takes all predictions having the probabilities satisfying the corresponding probability thresholds. Meanwhile, the *most_probable_cell_type* takes only the cell type which gives highest prediction probability. |
|
232 | 248 |
|
233 | 249 |
To have all plasma cells specified as plasma cells, we can set the *ignore_ambiguous_result* to TRUE. This option will actually hide all ambiguous prediction in case we have more distinct cell types. In the parent-chid(ren) relationship of cell types, the more specified cell types/phenotypes will be reported. Of course, we don't obtain the *most_probable_cell_type* in cell meta data. |
234 | 250 |
|
235 | 251 |
```{r} |
236 |
-classified_jerby <- classify_cells(classify_obj = jerby_seurat, |
|
237 |
- cell_types = 'all', path_to_models = getwd(), |
|
238 |
- ignore_ambiguous_result = TRUE) |
|
239 |
-table(classified_jerby[['predicted_cell_type']][,1], classified_jerby[['Plasma_cells']][,1]) |
|
240 |
-``` |
|
241 |
- |
|
242 |
-To check the B cell classification with actual B cells, we can do as follows: |
|
243 |
- |
|
244 |
-```{r} |
|
245 |
-classified_jerby <- classify_cells(classify_obj = jerby_seurat, |
|
246 |
- cell_types = 'all', path_to_models = getwd()) |
|
247 |
-# convert B cells label to binaries to ignore details of other cell types |
|
248 |
-label_B <- Idents(classified_jerby) |
|
249 |
-label_B <- unlist(lapply(label_B, function(x) if (x == 'B cells') {1} else 0)) |
|
250 |
- |
|
251 |
-# compare prediction with cell label |
|
252 |
-table(classified_jerby[['most_probable_cell_type']][,1], label_B) |
|
252 |
+classified <- classify_cells(classify_obj = test_set, sce_assay = 'counts', |
|
253 |
+ cell_types = 'all', path_to_models = getwd(), |
|
254 |
+ ignore_ambiguous_result = TRUE) |
|
255 |
+table(classified$predicted_cell_type, classified$plasma) |
|
253 | 256 |
``` |
254 | 257 |
|
255 | 258 |
## Session Info |