Browse code

Merge pull request #40 from ansonrel/master

Update scVI wrappers and add latent space model

Pierre-Luc authored on 03/07/2020 10:59:23 • GitHub committed on 03/07/2020 10:59:23
Showing 3 changed files

... ...
@@ -343,6 +343,7 @@ wrapp.drimpute <- function(sce,
343 343
 }
344 344
 
345 345
 
346
+
346 347
 #' imp.scVI
347 348
 #'
348 349
 #' A function calling a python wrapper (`scVI.py`) around `scVI` imputation, adapted from the the 'Basic usage' Jupyter notebook (https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/basic_tutorial.ipynb). Note that the function will create a temporary csv file for the intermediate storage of the input count matrix, needed by `scVI`.  
... ...
@@ -354,6 +355,7 @@ wrapp.drimpute <- function(sce,
354 355
 #' @param train_size Size of training set. Default to 0.8 but tutorial however recommends to use 1. 
355 356
 #' @param n_cores N. cores
356 357
 #' 
358
+#' 
357 359
 #' @return An object of the same class as `x` with updated slots.
358 360
 
359 361
 imp.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package="pipeComp"), py_path = NULL, n_cores = 1L, train_size = 1) {
... ...
@@ -361,14 +363,17 @@ imp.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package="p
361 363
   suppressPackageStartupMessages(library(reticulate))
362 364
   if (length(py_path)>0) use_python(py_path ,required=TRUE)
363 365
   trysource <- try(source_python(py_script))
364
-  if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") 
366
+  if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") 
365 367
   tfile <- tempfile(fileext=".csv", tmpdir = ".")
366 368
   write.csv(counts(x), tfile)
367
-  val <- t(scVI_imput(csv_file = tfile, csv_path = ".", n_cores = n_cores, train_size = train_size))
369
+  out <- scVI_imput(csv_file = tfile, csv_path = ".", n_cores = n_cores,
370
+                    train_size = train_size)
371
+  val <- t(out[[1]])
372
+  gnames <- as.character(out[[2]])
368 373
   file.remove(tfile)
369
-  dimnames(val) <- list(rownames(counts(x)), colnames(counts(x)))
374
+  dimnames(val) <- list(gnames, colnames(counts(x)))
375
+  x <- x[gnames, ]
370 376
   counts(x) <- as.matrix(val)
371
-  x
372 377
 }
373 378
 
374 379
 
... ...
@@ -12,11 +12,12 @@ from scvi.inference import UnsupervisedTrainer
12 12
 from scvi import set_seed
13 13
 import torch
14 14
 
15
-def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): 
15
+def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): 
16 16
   set_seed(seed)
17 17
   dat = CsvDataset(csv_file, 
18 18
                   save_path=csv_path, 
19
-                   new_n_genes=None) 
19
+                  new_n_genes=None) 
20
+  dat.subsample_genes(1000, mode="variance")
20 21
   # Based on recommendations in basic_tutorial.ipynb             
21 22
   n_epochs = 400 if (len(dat) < 10000) else 200 
22 23
   # trainer and model 
... ...
@@ -34,14 +35,15 @@ def scVI_imput(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0
34 35
   # Updating the "minibatch" size after training is useful in low memory configurations
35 36
   full = full.update({"batch_size":32})
36 37
   imp_val = full.sequential().imputation()
37
-  return(imp_val)
38
+  return[imp_val, dat.gene_names]
38 39
   
39
-def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): 
40
+def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): 
40 41
   set_seed(seed)
41 42
   dat = CsvDataset(csv_file, 
42 43
                   save_path=csv_path, 
43 44
                    new_n_genes=None) 
44
-  # Based on recommendations in basic_tutorial.ipynb             
45
+  dat.subsample_genes(1000, mode="variance")
46
+  # Based on recommendations in basic_tutorial.ipynb    
45 47
   n_epochs = 400 if (len(dat) < 10000) else 200 
46 48
   # trainer and model 
47 49
   vae = vae_model(dat.nb_genes, n_labels = n_labels)
... ...
@@ -57,7 +59,37 @@ def scVI_norm(csv_file, csv_path, vae_model = VAE, train_size = 1, n_labels = 0,
57 59
   full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat)))
58 60
   # Updating the "minibatch" size after training is useful in low memory configurations
59 61
   normalized_values = full.sequential().get_sample_scale()
60
-  return(normalized_values)
62
+  return[normalized_values, dat.gene_names]
63
+
64
+
65
+def scVI_latent(csv_file, csv_path, vae_model = VAE, train_size = 1.0, n_labels = 0, seed = 1234, n_cores = 1, lr = 1e-3, use_cuda = False): 
66
+  set_seed(seed)
67
+  dat = CsvDataset(csv_file, 
68
+                  save_path=csv_path, 
69
+                   new_n_genes=None) 
70
+  # Based on recommendations in basic_tutorial.ipynb    
71
+  n_epochs = 400 if (len(dat) < 10000) else 200 
72
+  # trainer and model 
73
+  vae = vae_model(dat.nb_genes, n_labels = n_labels)
74
+  trainer = UnsupervisedTrainer(
75
+    vae,
76
+    dat,
77
+    train_size=train_size, # default to 0.8, documentation recommends 1
78
+    use_cuda=use_cuda    
79
+    )
80
+  # limit cpu usage
81
+  torch.set_num_threads(n_cores) 
82
+  trainer.train(n_epochs=n_epochs, lr=lr)
83
+  full = trainer.create_posterior(trainer.model, dat, indices=np.arange(len(dat)))
84
+  # Updating the "minibatch" size after training is useful in low memory configurations
85
+  Z_hat = full.sequential().get_latent()[0]
86
+  adata = anndata.AnnData(dat.X)
87
+  for i, z in enumerate(Z_hat.T):
88
+      adata.obs[f'Z_{i}'] = z
89
+  # reordering for convenience and correspondance with PCA's ordering
90
+  cellLoads = adata.obs.reindex(adata.obs.std().sort_values().index, axis = 1)
91
+  return(cellLoads)
92
+
61 93
 
62 94
 
63 95
 from scvi.models import LDVAE
... ...
@@ -484,7 +484,6 @@ norm.scnorm.scaled <- function(x, ...){
484 484
 #' @param py_script Location of the python script
485 485
 #' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. 
486 486
 #' @param train_size Size of training set. Default to 0.8 but tutorial however recommends to use 1. 
487
-#' @param noscale Logical; whether to disable scaling (default FALSE)
488 487
 #' @param n_cores N. cores
489 488
 #' 
490 489
 #' @return An object of the same class as `x` with updated slots.
... ...
@@ -494,13 +493,17 @@ norm.scVI <- function(x, py_script = system.file("extdata", "scVI.py", package="
494 493
   suppressPackageStartupMessages(library(reticulate))
495 494
   if (length(py_path)>0) use_python(py_path ,required=TRUE)
496 495
   trysource <- try(source_python(py_script))
497
-  if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") 
496
+  if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") 
498 497
   tfile <- tempfile(fileext=".csv", tmpdir = ".")
499 498
   if (is(x, "Seurat")) dat <- GetAssayData(x, assay = "RNA", slot = "counts") else dat <- counts(x)
500 499
   write.csv(dat, tfile)
501
-  val <- t(scVI_norm(csv_file = tfile, csv_path = ".", n_cores = n_cores, train_size = train_size))
500
+  out <- scVI_norm(csv_file = tfile, csv_path = ".", n_cores = n_cores, 
501
+                   train_size = train_size)
502
+  val <- t(out[[1]])
503
+  gnames <- as.character(out[[2]])
502 504
   file.remove(tfile)
503
-  dimnames(val) <- list(rownames(dat), colnames(dat))
505
+  dimnames(val) <- list(gnames, colnames(dat))
506
+  x <- x[gnames, ]
504 507
   if(is(x,"Seurat")){ 
505 508
     x <- SetAssayData(x, slot="data", new.data=as.matrix(val))
506 509
     x <- SetAssayData(x, slot="scale.data", as.matrix(val))
... ...
@@ -729,6 +732,60 @@ scran.runPCA <- function(x, dims=50){
729 732
   if(is(x, "Seurat")) return(sceDR2seurat(reducedDim(dat, "PCA"), x, "pca")) else return(dat)
730 733
 }
731 734
 
735
+#' scVI.latent
736
+#'
737
+#' A function calling a python wrapper (`scVI.py`) around `scVI` low-dimensional latent space, adapted from the official tutorial (https://scvi.readthedocs.io/en/stable/tutorials/basic_tutorial.html). Note that the function will create a temporary csv file for the intermediate storage of the input count matrix, needed by `scVI`.  
738
+#' 
739
+#' 
740
+#' @param x A SCE or Seurat object.
741
+#' @param py_script Location of the python script
742
+#' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. 
743
+#' @param dims Number of dimensions to return. 
744
+#' @param learning_rate Learning rate of the model. If the model is not training properly due to too high learning rate, it will be reduced consecutively a few times before early stop. 
745
+#' @param n_cores N. cores
746
+#' 
747
+#' @return An object of the same class as `x` with updated slots. Note that scVI-LD initially returns unordered components. For convenience with the package, they are ordered by sdev and renamed 'PC'. 
748
+
749
+scVI.latent <- function(x, dims = 50L, learning_rate = 1e-3, py_script = system.file("extdata", "scVI.py", package="pipeComp"), py_path = NULL, n_cores = 1L) {
750
+  dims <- as.integer(dims)
751
+  n_cores <- as.integer(n_cores)
752
+  suppressPackageStartupMessages(library(reticulate))
753
+  if (length(py_path)>0) use_python(py_path ,required=TRUE)
754
+  trysource <- try(source_python(py_script))
755
+  if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") 
756
+  tfile <- tempfile(fileext=".csv", tmpdir = ".")
757
+  if (is(x, "Seurat")) {
758
+    dat <- GetAssayData(x, assay = "RNA", slot = "counts")
759
+    dat <- dat[VariableFeatures(x), ]
760
+  } else {
761
+    
762
+    dat <- counts(x)
763
+    dat <- dat[metadata(x)$VariableFeat, ]
764
+  } 
765
+  write.csv(dat, tfile)
766
+  val <- try(scVI_latent(csv_file = tfile, csv_path = ".", n_cores = n_cores, lr = learning_rate))
767
+  # Error with some dataset; "Loss was NaN 10 consecutive times: the model is not training properly. Consider using a lower learning rate" --> reduce lr
768
+  if (class(val) == "try-error") {
769
+    ntry <- 0
770
+    while(ntry < 6 & class(val) == "try-error") {
771
+      ntry <- ntry + 1
772
+      learning_rate <- learning_rate/10
773
+      message("Downgrading learning rate to ", learning_rate)
774
+      val <- try(scVI_latent(csv_file = tfile, csv_path = ".", n_cores = n_cores, lr = learning_rate))
775
+    }
776
+  }
777
+  file.remove(tfile)
778
+  rownames(val) <- colnames(dat)
779
+  # rename
780
+  colnames(val) <- paste0("PC_", 1:ncol(val))
781
+  if(is(x, "Seurat")){
782
+    x[["pca"]] <- CreateDimReducObject(embeddings=as.matrix(val), 
783
+                                       key="PC_", assay="RNA")
784
+  } else {
785
+    reducedDim(x, "PCA") <- as.matrix(val)
786
+  }
787
+  x
788
+}
732 789
 
733 790
 
734 791
 #' scVI.LD
... ...
@@ -741,7 +798,6 @@ scran.runPCA <- function(x, dims=50){
741 798
 #' @param py_path Optional. If scVI was installed in a specific python bin, pass here the path to it. 
742 799
 #' @param dims Number of dimensions to return. 
743 800
 #' @param learning_rate Learning rate of the model. If the model is not training properly due to too high learning rate, it will be reduced consecutively a few times before early stop. 
744
-#' @param noscale Logical; whether to disable scaling (default FALSE)
745 801
 #' @param n_cores N. cores
746 802
 #' 
747 803
 #' @return An object of the same class as `x` with updated slots. Note that scVI-LD initially returns unordered components. For convenience with the package, they are ordered by sdev and renamed 'PC'. 
... ...
@@ -752,15 +808,22 @@ scVI.LD <- function(x, dims = 50L, learning_rate = 1e-3, py_script = system.file
752 808
   suppressPackageStartupMessages(library(reticulate))
753 809
   if (length(py_path)>0) use_python(py_path ,required=TRUE)
754 810
   trysource <- try(source_python(py_script))
755
-  if (is(trysource, "try-error")) stop("Cannot source the python wrapper.") 
811
+  if (class(trysource) == "try-error") stop("Cannot source the python wrapper.") 
756 812
   tfile <- tempfile(fileext=".csv", tmpdir = ".")
757
-  if (is(x, "Seurat")) dat <- GetAssayData(x, assay = "RNA", slot = "counts") else dat <- counts(x)
813
+  if (is(x, "Seurat")) {
814
+    dat <- GetAssayData(x, assay = "RNA", slot = "counts")
815
+    dat <- dat[VariableFeatures(x), ]
816
+  } else {
817
+    
818
+    dat <- counts(x)
819
+    dat <- dat[metadata(x)$VariableFeat, ]
820
+  } 
758 821
   write.csv(dat, tfile)
759 822
   val <- try(scVI_ld(csv_file = tfile, ndims = dims, csv_path = ".", n_cores = n_cores, lr = learning_rate))
760 823
   # Error with some dataset; "Loss was NaN 10 consecutive times: the model is not training properly. Consider using a lower learning rate" --> reduce lr
761
-  if (is(val, "try-error")) {
824
+  if (class(val) == "try-error") {
762 825
     ntry <- 0
763
-    while(ntry < 6 & is(val, "try-error")) {
826
+    while(ntry < 6 & class(val) == "try-error") {
764 827
       ntry <- ntry + 1
765 828
       learning_rate <- learning_rate/10
766 829
       message("Downgrading learning rate to ", learning_rate)