Browse code

Finished up propagation and removal of NAs for the GSVA method

Robert Castelo authored on 24/10/2024 18:10:23
Showing 3 changed files

... ...
@@ -109,10 +109,14 @@ compute.gene.cdf <- function(expr, sample.idxs, Gaussk=TRUE, kernel=TRUE,
109 109
             else
110 110
                 gene.cdf <- .ecdfvals_sparse_to_dense(expr[, sample.idxs, drop=FALSE],
111 111
                                                       verbose)
112
-        } else if (is.matrix(expr))
113
-            gene.cdf <- .ecdfvals_dense_to_dense(expr[, sample.idxs, drop=FALSE],
112
+        } else if (is.matrix(expr)) {
113
+            if (any_na)
114
+                gene.cdf <- .ecdfvals_dense_to_dense_nas(expr[, sample.idxs, drop=FALSE],
115
+                                                         verbose)
116
+            else
117
+                gene.cdf <- .ecdfvals_dense_to_dense(expr[, sample.idxs, drop=FALSE],
114 118
                                                  verbose)
115
-        else
119
+        } else
116 120
             stop(sprintf("Matrix class %s cannot be handled yet.", class(expr)))
117 121
     }
118 122
 
... ...
@@ -973,22 +977,20 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
973 977
                   walkStat <- .gsvaRndWalk_nas(gSetIdx, decOrdStat, symRnkStat,
974 978
                                                tau, na_use, minSize, wna_env)
975 979
                   maxDev <- c(NA_real_, NA_real_)
976
-                  if (na_use == "na.rm")
977
-                      maxDev <- c(max(c(0, max(walkStat, na.rm=TRUE))),
978
-                                  min(c(0, min(walkStat, na.rm=TRUE))))
979
-                  else
980
-                      maxDev <- c(max(c(0, max(walkStat))),
981
-                                  min(c(0, min(walkStat))))
980
+                  if (any(!is.na(walkStat))) {
981
+                      if (na_use == "na.rm")
982
+                          maxDev <- c(max(c(0, max(walkStat, na.rm=TRUE))),
983
+                                      min(c(0, min(walkStat, na.rm=TRUE))))
984
+                      else
985
+                          maxDev <- c(max(c(0, max(walkStat))),
986
+                                      min(c(0, min(walkStat))))
987
+                  }
982 988
                   maxDev
983 989
                 }, rnkstats$dos, rnkstats$srs, na_use)
984 990
                 md <- do.call("rbind", md)
985 991
                 if (maxDiff && absRanking)
986 992
                     md[, 2] <- -1 * md[, 2]
987
-                sco <- rep(NA_real_, length(geneSetsIdx))
988
-                if (na_use == "na.rm")
989
-                    sco <- rowSums(md, na.rm=TRUE)
990
-                else 
991
-                    sco <- rowSums(md)
993
+                sco <- rowSums(md)
992 994
                 if (!maxDiff) {
993 995
                     mask <- is.na(sco)
994 996
                     sco[!mask] <- md[cbind(1:sum(!mask), ifelse(sco[!mask] > 0,
... ...
@@ -1026,22 +1028,20 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
1026 1028
                   walkStat <- .gsvaRndWalk_nas(gSetIdx, decOrdStat, symRnkStat,
1027 1029
                                                tau, na_use, minSize, wna_env)
1028 1030
                   maxDev <- c(NA_real_, NA_real_)
1029
-                  if (na_use == "na.rm")
1030
-                      maxDev <- c(max(c(0, max(walkStat, na.rm=TRUE))),
1031
-                                  min(c(0, min(walkStat, na.rm=TRUE))))
1032
-                  else
1033
-                      maxDev <- c(max(c(0, max(walkStat))),
1034
-                                  min(c(0, min(walkStat))))
1031
+                  if (any(!is.na(walkStat))) {
1032
+                      if (na_use == "na.rm")
1033
+                          maxDev <- c(max(c(0, max(walkStat, na.rm=TRUE))),
1034
+                                      min(c(0, min(walkStat, na.rm=TRUE))))
1035
+                      else
1036
+                          maxDev <- c(max(c(0, max(walkStat))),
1037
+                                      min(c(0, min(walkStat))))
1038
+                  }
1035 1039
                   maxDev
1036 1040
                 }, rnkstats$dos, rnkstats$srs, na_use)
1037 1041
                 md <- do.call("rbind", md)
1038 1042
                 if (maxDiff && absRanking)
1039 1043
                     md[, 2] <- -1 * md[, 2]
1040
-                sco <- rep(NA_real_, length(geneSetsIdx))
1041
-                if (na_use == "na.rm")
1042
-                    sco <- rowSums(md, na.rm=TRUE)
1043
-                else 
1044
-                    sco <- rowSums(md)
1044
+                sco <- rowSums(md)
1045 1045
                 if (!maxDiff) {
1046 1046
                     mask <- is.na(sco)
1047 1047
                     sco[!mask] <- md[cbind(1:sum(!mask), ifelse(sco[!mask] > 0,
... ...
@@ -1225,6 +1225,12 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
1225 1225
   .Call("ecdfvals_dense_to_dense_R", X, verbose)
1226 1226
 }
1227 1227
 
1228
+.ecdfvals_dense_to_dense_nas <- function(X, verbose) {
1229
+  stopifnot(is.matrix(X)) ## QC
1230
+  stopifnot(is.logical(verbose)) ## QC
1231
+  .Call("ecdfvals_dense_to_dense_nas_R", X, verbose)
1232
+}
1233
+
1228 1234
 .kcdfvals_sparse_to_sparse <- function(X, Gaussk, verbose) {
1229 1235
   stopifnot(is(X, "CsparseMatrix")) ## QC
1230 1236
   Xrsp <- as(X, "RsparseMatrix")
... ...
@@ -32,6 +32,9 @@ ecdfvals_sparse_to_dense_R(SEXP XCspR, SEXP XRspR, SEXP verboseR);
32 32
 SEXP
33 33
 ecdfvals_dense_to_dense_R(SEXP XR, SEXP verboseR);
34 34
 
35
+SEXP
36
+ecdfvals_dense_to_dense_nas_R(SEXP XR, SEXP verboseR);
37
+
35 38
 SEXP
36 39
 match_int(SEXP x, SEXP table);
37 40
 
... ...
@@ -513,3 +516,134 @@ ecdfvals_dense_to_dense_R(SEXP XR, SEXP verboseR) {
513 516
 
514 517
   return(ecdfRobj);
515 518
 }
519
+
520
+/* calculate empirical cumulative distribution function values
521
+ * on the zero and nonzero entries from the input dense matrix,
522
+ * when missing (NA) values are present, the returned value is
523
+ * a dense matrix.
524
+ */
525
+SEXP
526
+ecdfvals_dense_to_dense_nas_R(SEXP XR, SEXP verboseR) {
527
+  double* X;
528
+  Rboolean verbose=asLogical(verboseR);
529
+  int     nr, nc;
530
+  SEXP    ecdfRobj;
531
+  double* ecdf_vals;
532
+  SEXP pb = R_NilValue;
533
+  int  nunprotect=0;
534
+
535
+  PROTECT(XR); nunprotect++;
536
+
537
+  nr = INTEGER(getAttrib(XR, R_DimSymbol))[0]; /* number of rows */
538
+  nc = INTEGER(getAttrib(XR, R_DimSymbol))[1]; /* number of columns */
539
+  X  = REAL(XR);
540
+
541
+  /* create a new dense matrix object to store the result,
542
+   * if nr * nc > INT_MAX and LONG_VECTOR_SUPPORT is not
543
+   * available, the function allocMatrix() will prompt an error */
544
+  ecdfRobj = PROTECT(allocMatrix(REALSXP, nr, nc)); nunprotect++;
545
+
546
+  if (verbose) {
547
+    pb = PROTECT(cli_progress_bar(nr, NULL));
548
+    cli_progress_set_name(pb, "Estimating ECDFs");
549
+    nunprotect++;
550
+  }
551
+
552
+  for (int i=0; i < nr; i++) {
553
+    SEXP          xR, uniqvR;
554
+    int           nuniqv;
555
+    double*       x;
556
+    double*       uniqv;
557
+    double*       ecdfuniqv;
558
+    int           sum;
559
+    double*       e1_p;
560
+    const double* e2_p;
561
+    int*          mt;
562
+    int*          tab;
563
+    int           nnas;
564
+
565
+    if (verbose) { /* show progress */
566
+      if (i % 100 == 0 && CLI_SHOULD_TICK)
567
+        cli_progress_set(pb, i);
568
+    }
569
+
570
+    /* remove consecutive repeated elements */
571
+    PROTECT(uniqvR = allocVector(REALSXP, nc));
572
+    PROTECT(xR = allocVector(REALSXP, nc));
573
+    uniqv = REAL(uniqvR);
574
+    x = REAL(xR);
575
+    nnas = 0; /* non-NA values */
576
+    for (int j=0; j < nc; j++) {
577
+#ifdef LONG_VECTOR_SUPPORT
578
+      R_xlen_t idx = nr * j + i;
579
+#else
580
+      int idx = nr * j + i;
581
+#endif
582
+      x[j] = X[idx];
583
+      if (!ISNA(x[j])) {
584
+        uniqv[nnas] = x[j];
585
+        nnas++;
586
+      }
587
+    }
588
+
589
+    qsort(uniqv, nnas, sizeof(double), dbl_cmp);
590
+    e1_p = uniqv;
591
+    e2_p = e1_p + 1;
592
+    nuniqv = 0;
593
+    for (int j=0; j < nnas; j++) {
594
+      if (*e2_p != *e1_p) {
595
+        *(++e1_p) = *e2_p;
596
+        nuniqv++;
597
+      }
598
+      e2_p++;
599
+    }
600
+
601
+    /* match original values to sorted unique values */
602
+    /* consider adding LONG_VECTOR_SUPPORT */
603
+    mt = INTEGER(match_int(xR, uniqvR)); /* 1-based! */
604
+
605
+    /* tabulate matches */
606
+    /* consider adding LONG_VECTOR_SUPPORT */
607
+    tab = R_Calloc(nuniqv, int); /* assuming zeroes are set */
608
+    for (int j=0; j < nc; j++)
609
+      if (!ISNA(mt[j]) && mt[j] > 0 && mt[j] <= nuniqv)
610
+        tab[mt[j] - 1]++;
611
+
612
+    /* cumulative sum to calculate ecdf values */
613
+    /* consider adding LONG_VECTOR_SUPPORT */
614
+    ecdfuniqv = R_Calloc(nuniqv, double); /* assuming zeroes are set */
615
+    sum = 0;
616
+    for (int j=0; j < nuniqv; j++) {
617
+      sum = sum + tab[j];
618
+      ecdfuniqv[j] = ((double) sum) / ((double) nc);
619
+    }
620
+
621
+    /* set ecdf values on the corresponding positions
622
+     * of the output dense matrix */
623
+    ecdf_vals = REAL(ecdfRobj);
624
+
625
+    for (int j=0; j < nc; j++) {
626
+#ifdef LONG_VECTOR_SUPPORT
627
+      R_xlen_t idx = nr * j + i;
628
+#else
629
+      int idx = nr * j + i;
630
+#endif
631
+      if (!ISNA(X[idx]))
632
+        ecdf_vals[idx] = ecdfuniqv[mt[j]-1];
633
+      else
634
+        ecdf_vals[idx] = NA_REAL;
635
+    }
636
+
637
+    R_Free(ecdfuniqv);
638
+    R_Free(tab);
639
+
640
+    UNPROTECT(2); /* xR uniqvR */
641
+  }
642
+
643
+  if (verbose)
644
+    cli_progress_done(pb);
645
+
646
+  UNPROTECT(nunprotect); /* XR ecdfRobj pb */
647
+
648
+  return(ecdfRobj);
649
+}
... ...
@@ -29,6 +29,8 @@ ecdfvals_sparse_to_dense_R(SEXP XCspR, SEXP XRspR, SEXP verboseR);
29 29
 SEXP
30 30
 ecdfvals_dense_to_dense_R(SEXP XR, SEXP verboseR);
31 31
 
32
+SEXP
33
+ecdfvals_dense_to_dense_nas_R(SEXP XR, SEXP verboseR);
32 34
 
33 35
 SEXP
34 36
 order_rankstat_R(SEXP xR);
... ...
@@ -56,6 +58,7 @@ static R_CallMethodDef callMethods[] = {
56 58
   {"ecdfvals_sparse_to_sparse_R", (DL_FUNC) &ecdfvals_sparse_to_sparse_R, 3},
57 59
   {"ecdfvals_sparse_to_dense_R", (DL_FUNC) &ecdfvals_sparse_to_dense_R, 3},
58 60
   {"ecdfvals_dense_to_dense_R", (DL_FUNC) &ecdfvals_dense_to_dense_R, 2},
61
+  {"ecdfvals_dense_to_dense_nas_R", (DL_FUNC) &ecdfvals_dense_to_dense_nas_R, 2},
59 62
   {"gsva_rnd_walk_R", (DL_FUNC) &gsva_rnd_walk_R, 3},
60 63
   {"gsva_scores_genesets_R", (DL_FUNC) &gsva_score_genesets_R, 5},
61 64
   {"order_rankstat_sparse_to_dense_R", (DL_FUNC) &order_rankstat_sparse_to_dense_R, 2},