Update scVI wrappers and add latent space model
... | ... |
@@ -343,6 +343,7 @@ wrapp.drimpute <- function(sce, |
343 | 343 |
} |
344 | 344 |
|
345 | 345 |
|
346 |
+ |
|
346 | 347 |
#' imp.scVI |
347 | 348 |
#' |
348 | 349 |
#' A function calling a python wrapper (`scVI.py`) around `scVI` imputation, adapted from the the 'Basic usage' Jupyter notebook (https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/basic_tutorial.ipynb). Note that the function will create a temporary csv file for the intermediate storage of the input count matrix, needed by `scVI`. |
... | ... |
@@ -354,6 +355,7 @@ wrapp.drimpute <- function(sce, |
354 | 355 |
#' @param train_size Size of training set. Default to 0.8 but tutorial however recommends to use 1. |
355 | 356 |
#' @param n_cores N. cores |
356 | 357 |
#' |
358 |
+#' |
|
357 | 359 |
#' @return An object of the same class as `x` with updated slots. |
358 | 360 |
|
359 | 361 |
imp.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package="pipeComp"), py_path = NULL, n_cores = 1L, train_size = 1) { |
... | ... |
@@ -361,14 +363,17 @@ imp.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package="p |
361 | 363 |
suppressPackageStartupMessages(library(reticulate)) |
362 | 364 |
if (length(py_path)>0) use_python(py_path ,required=TRUE) |
363 | 365 |
trysource <- try(source_python(py_script)) |
364 |
- if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") |
|
366 |
+ if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") |
|
365 | 367 |
tfile <- tempfile(fileext=".csv", tmpdir = ".") |
366 | 368 |
write.csv(counts(x), tfile) |
367 |
- val <- t(scVI_imput(csv_file = tfile, csv_path = ".", n_cores = n_cores, train_size = train_size)) |
|
369 |
+ out <- scVI_imput(csv_file = tfile, csv_path = ".", n_cores = n_cores, |
|
370 |
+ train_size = train_size) |
|
371 |
+ val <- t(out[[1]]) |
|
372 |
+ gnames <- as.character(out[[2]]) |
|
368 | 373 |
file.remove(tfile) |
369 |
- dimnames(val) <- list(rownames(counts(x)), colnames(counts(x))) |
|
374 |
+ dimnames(val) <- list(gnames, colnames(counts(x))) |
|
375 |
+ x <- x[gnames, ] |
|
370 | 376 |
counts(x) <- as.matrix(val) |
371 |
- x |
|
372 | 377 |
} |
373 | 378 |
|
374 | 379 |
|
... | ... |
@@ -12,11 +12,12 @@ from scvi.inference import UnsupervisedTrainer |
12 | 12 |
from scvi import set_seed |
13 | 13 |
import torch |
14 | 14 |
|
15 |
-def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): |
|
15 |
+def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): |
|
16 | 16 |
set_seed(seed) |
17 | 17 |
dat = CsvDataset(csv_file, |
18 | 18 |
save_path=csv_path, |
19 |
- new_n_genes=None) |
|
19 |
+ new_n_genes=None) |
|
20 |
+ dat.subsample_genes(1000, mode="variance") |
|
20 | 21 |
# Based on recommendations in basic_tutorial.ipynb |
21 | 22 |
n_epochs = 400 if (len(dat) < 10000) else 200 |
22 | 23 |
# trainer and model |
... | ... |
@@ -34,14 +35,15 @@ def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0 |
34 | 35 |
# Updating the "minibatch" size after training is useful in low memory configurations |
35 | 36 |
full = full.update({"batch_size":32}) |
36 | 37 |
imp_val = full.sequential().imputation() |
37 |
- return(imp_val) |
|
38 |
+ return[imp_val, dat.gene_names] |
|
38 | 39 |
|
39 |
-def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): |
|
40 |
+def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): |
|
40 | 41 |
set_seed(seed) |
41 | 42 |
dat = CsvDataset(csv_file, |
42 | 43 |
save_path=csv_path, |
43 | 44 |
new_n_genes=None) |
44 |
- # Based on recommendations in basic_tutorial.ipynb |
|
45 |
+ dat.subsample_genes(1000, mode="variance") |
|
46 |
+ # Based on recommendations in basic_tutorial.ipynb |
|
45 | 47 |
n_epochs = 400 if (len(dat) < 10000) else 200 |
46 | 48 |
# trainer and model |
47 | 49 |
vae = vae_model(dat.nb_genes, n_labels = n_labels) |
... | ... |
@@ -57,7 +59,37 @@ def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0, |
57 | 59 |
full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat))) |
58 | 60 |
# Updating the "minibatch" size after training is useful in low memory configurations |
59 | 61 |
normalized_values = full.sequential().get_sample_scale() |
60 |
- return(normalized_values) |
|
62 |
+ return[normalized_values, dat.gene_names] |
|
63 |
+ |
|
64 |
+ |
|
65 |
+def scVI_latent(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): |
|
66 |
+ set_seed(seed) |
|
67 |
+ dat = CsvDataset(csv_file, |
|
68 |
+ save_path=csv_path, |
|
69 |
+ new_n_genes=None) |
|
70 |
+ # Based on recommendations in basic_tutorial.ipynb |
|
71 |
+ n_epochs = 400 if (len(dat) < 10000) else 200 |
|
72 |
+ # trainer and model |
|
73 |
+ vae = vae_model(dat.nb_genes, n_labels = n_labels) |
|
74 |
+ trainer = UnsupervisedTrainer( |
|
75 |
+ vae, |
|
76 |
+ dat, |
|
77 |
+ train_size=train_size, # default to 0.8, documentation recommends 1 |
|
78 |
+ use_cuda=use_cuda |
|
79 |
+ ) |
|
80 |
+ # limit cpu usage |
|
81 |
+ torch.set_num_threads(n_cores) |
|
82 |
+ trainer.train(n_epochs=n_epochs, lr=lr) |
|
83 |
+ full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat))) |
|
84 |
+ # Updating the "minibatch" size after training is useful in low memory configurations |
|
85 |
+ Z_hat = full.sequential().get_latent()[0] |
|
86 |
+ adata = anndata.AnnData(dat.X) |
|
87 |
+ for i, z in enumerate(Z_hat.T): |
|
88 |
+ adata.obs[f'Z_{i}'] = z |
|
89 |
+ # reordering for convenience and correspondance with PCA's ordering |
|
90 |
+ cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis = 1) |
|
91 |
+ return(cellLoads) |
|
92 |
+ |
|
61 | 93 |
|
62 | 94 |
|
63 | 95 |
from scvi.models import LDVAE |
... | ... |
@@ -484,7 +484,6 @@ norm.scnorm.scaled <- function(x, ...){ |
484 | 484 |
#' @param py_script Location of the python script |
485 | 485 |
#' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. |
486 | 486 |
#' @param train_size Size of training set. Default to 0.8 but tutorial however recommends to use 1. |
487 |
-#' @param noscale Logical; whether to disable scaling (default FALSE) |
|
488 | 487 |
#' @param n_cores N. cores |
489 | 488 |
#' |
490 | 489 |
#' @return An object of the same class as `x` with updated slots. |
... | ... |
@@ -494,13 +493,17 @@ norm.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package=" |
494 | 493 |
suppressPackageStartupMessages(library(reticulate)) |
495 | 494 |
if (length(py_path)>0) use_python(py_path ,required=TRUE) |
496 | 495 |
trysource <- try(source_python(py_script)) |
497 |
- if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") |
|
496 |
+ if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") |
|
498 | 497 |
tfile <- tempfile(fileext=".csv", tmpdir = ".") |
499 | 498 |
if (is(x, "Seurat")) dat <- GetAssayData(x, assay = "RNA", slot = "counts") else dat <- counts(x) |
500 | 499 |
write.csv(dat, tfile) |
501 |
- val <- t(scVI_norm(csv_file = tfile, csv_path = ".", n_cores = n_cores, train_size = train_size)) |
|
500 |
+ out <- scVI_norm(csv_file = tfile, csv_path = ".", n_cores = n_cores, |
|
501 |
+ train_size = train_size) |
|
502 |
+ val <- t(out[[1]]) |
|
503 |
+ gnames <- as.character(out[[2]]) |
|
502 | 504 |
file.remove(tfile) |
503 |
- dimnames(val) <- list(rownames(dat), colnames(dat)) |
|
505 |
+ dimnames(val) <- list(gnames, colnames(dat)) |
|
506 |
+ x <- x[gnames, ] |
|
504 | 507 |
if(is(x,"Seurat")){ |
505 | 508 |
x <- SetAssayData(x, slot="data", new.data=as.matrix(val)) |
506 | 509 |
x <- SetAssayData(x, slot="scale.data", as.matrix(val)) |
... | ... |
@@ -729,6 +732,60 @@ scran.runPCA <- function(x, dims=50){ |
729 | 732 |
if(is(x, "Seurat")) return(sceDR2seurat(reducedDim(dat, "PCA"), x, "pca")) else return(dat) |
730 | 733 |
} |
731 | 734 |
|
735 |
+#' scVI.latent |
|
736 |
+#' |
|
737 |
+#' A function calling a python wrapper (`scVI.py`) around `scVI` low-dimensional latent space, adapted from the official tutorial (https://scvi.readthedocs.io/en/stable/tutorials/basic_tutorial.html). Note that the function will create a temporary csv file for the intermediate storage of the input count matrix, needed by `scVI`. |
|
738 |
+#' |
|
739 |
+#' |
|
740 |
+#' @param x A SCE or Seurat object. |
|
741 |
+#' @param py_script Location of the python script |
|
742 |
+#' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. |
|
743 |
+#' @param dims Number of dimensions to return. |
|
744 |
+#' @param learning_rate Learning rate of the model. If the model is not training properly due to too high learning rate, it will be reduced consecutively a few times before early stop. |
|
745 |
+#' @param n_cores N. cores |
|
746 |
+#' |
|
747 |
+#' @return An object of the same class as `x` with updated slots. Note that scVI-LD initially returns unordered components. For convenience with the package, they are ordered by sdev and renamed 'PC'. |
|
748 |
+ |
|
749 |
+scVI.latent <- function(x, dims = 50L, learning_rate = 1e-3, py_script = system.file("extdata", "scVI.py", package="pipeComp"), py_path = NULL, n_cores = 1L) { |
|
750 |
+ dims <- as.integer(dims) |
|
751 |
+ n_cores <- as.integer(n_cores) |
|
752 |
+ suppressPackageStartupMessages(library(reticulate)) |
|
753 |
+ if (length(py_path)>0) use_python(py_path ,required=TRUE) |
|
754 |
+ trysource <- try(source_python(py_script)) |
|
755 |
+ if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") |
|
756 |
+ tfile <- tempfile(fileext=".csv", tmpdir = ".") |
|
757 |
+ if (is(x, "Seurat")) { |
|
758 |
+ dat <- GetAssayData(x, assay = "RNA", slot = "counts") |
|
759 |
+ dat <- dat[VariableFeatures(x), ] |
|
760 |
+ } else { |
|
761 |
+ |
|
762 |
+ dat <- counts(x) |
|
763 |
+ dat <- dat[metadata(x)$VariableFeat, ] |
|
764 |
+ } |
|
765 |
+ write.csv(dat, tfile) |
|
766 |
+ val <- try(scVI_latent(csv_file = tfile, csv_path = ".", n_cores = n_cores, lr = learning_rate)) |
|
767 |
+ # Error with some dataset; "Loss was NaN 10 consecutive times: the model is not training properly. Consider using a lower learning rate" --> reduce lr |
|
768 |
+ if (class(val) == "try-error") { |
|
769 |
+ ntry <- 0 |
|
770 |
+ while(ntry < 6 & class(val) == "try-error") { |
|
771 |
+ ntry <- ntry + 1 |
|
772 |
+ learning_rate <- learning_rate/10 |
|
773 |
+ message("Downgrading learning rate to ", learning_rate) |
|
774 |
+ val <- try(scVI_latent(csv_file = tfile, csv_path = ".", n_cores = n_cores, lr = learning_rate)) |
|
775 |
+ } |
|
776 |
+ } |
|
777 |
+ file.remove(tfile) |
|
778 |
+ rownames(val) <- colnames(dat) |
|
779 |
+ # rename |
|
780 |
+ colnames(val) <- paste0("PC_", 1:ncol(val)) |
|
781 |
+ if(is(x, "Seurat")){ |
|
782 |
+ x[["pca"]] <- CreateDimReducObject(embeddings=as.matrix(val), |
|
783 |
+ key="PC_", assay="RNA") |
|
784 |
+ } else { |
|
785 |
+ reducedDim(x, "PCA") <- as.matrix(val) |
|
786 |
+ } |
|
787 |
+ x |
|
788 |
+} |
|
732 | 789 |
|
733 | 790 |
|
734 | 791 |
#' scVI.LD |
... | ... |
@@ -741,7 +798,6 @@ scran.runPCA <- function(x, dims=50){ |
741 | 798 |
#' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. |
742 | 799 |
#' @param dims Number of dimensions to return. |
743 | 800 |
#' @param learning_rate Learning rate of the model. If the model is not training properly due to too high learning rate, it will be reduced consecutively a few times before early stop. |
744 |
-#' @param noscale Logical; whether to disable scaling (default FALSE) |
|
745 | 801 |
#' @param n_cores N. cores |
746 | 802 |
#' |
747 | 803 |
#' @return An object of the same class as `x` with updated slots. Note that scVI-LD initially returns unordered components. For convenience with the package, they are ordered by sdev and renamed 'PC'. |
... | ... |
@@ -752,15 +808,22 @@ scVI.LD <- function(x, dims = 50L, learning_rate = 1e-3, py_script = system.file |
752 | 808 |
suppressPackageStartupMessages(library(reticulate)) |
753 | 809 |
if (length(py_path)>0) use_python(py_path ,required=TRUE) |
754 | 810 |
trysource <- try(source_python(py_script)) |
755 |
- if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") |
|
811 |
+ if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") |
|
756 | 812 |
tfile <- tempfile(fileext=".csv", tmpdir = ".") |
757 |
- if (is(x, "Seurat")) dat <- GetAssayData(x, assay = "RNA", slot = "counts") else dat <- counts(x) |
|
813 |
+ if (is(x, "Seurat")) { |
|
814 |
+ dat <- GetAssayData(x, assay = "RNA", slot = "counts") |
|
815 |
+ dat <- dat[VariableFeatures(x), ] |
|
816 |
+ } else { |
|
817 |
+ |
|
818 |
+ dat <- counts(x) |
|
819 |
+ dat <- dat[metadata(x)$VariableFeat, ] |
|
820 |
+ } |
|
758 | 821 |
write.csv(dat, tfile) |
759 | 822 |
val <- try(scVI_ld(csv_file = tfile, ndims = dims, csv_path = ".", n_cores = n_cores, lr = learning_rate)) |
760 | 823 |
# Error with some dataset; "Loss was NaN 10 consecutive times: the model is not training properly. Consider using a lower learning rate" --> reduce lr |
761 |
- if (is(val, "try-error")) { |
|
824 |
+ if (class(val) == "try-error") { |
|
762 | 825 |
ntry <- 0 |
763 |
- while(ntry < 6 & is(val, "try-error")) { |
|
826 |
+ while(ntry < 6 & class(val) == "try-error") { |
|
764 | 827 |
ntry <- ntry + 1 |
765 | 828 |
learning_rate <- learning_rate/10 |
766 | 829 |
message("Downgrading learning rate to ", learning_rate) |