Browse code

allow user to set intercept type

MikeDMorgan authored on 11/09/2024 14:48:33
Showing 4 changed files

... ...
@@ -15,6 +15,11 @@
15 15
 #' @param geno.only A logical value that flags the model to use either just the \code{matrix} `Kin` or the supplied random effects.
16 16
 #' @param solver a character value that determines which optimisation algorithm is used for the variance components. Must be either
17 17
 #' HE (Haseman-Elston regression) or Fisher (Fisher scoring).
18
+#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
19
+#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
20
+#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
21
+#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
22
+#' set automatically.
18 23
 #'
19 24
 #' @details
20 25
 #' This function runs a negative binomial generalised linear mixed effects model. If mixed effects are detected in testNhoods,
... ...
@@ -81,6 +86,7 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
81 86
                                       init.sigma=NULL, init.beta=NULL,
82 87
                                       init.u=NULL, solver=NULL),
83 88
                     dispersion = 1, geno.only=FALSE,
89
+                    intercept.type="fixed",
84 90
                     solver=NULL){
85 91
 
86 92
     if(!is.null(solver)){
... ...
@@ -273,8 +279,12 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
273 279
             diag(eyeN) <- 1
274 280
             errMat <- (e0 %*% t(e0))/sig0 - eyeN
275 281
 
276
-            exp.levels <- random.levels
277
-            exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
282
+            if(intercept.type == "random"){
283
+                exp.levels <- random.levels
284
+                exp.levels <- exp.levels[!grepl(exp.levels, pattern="residual")]
285
+            } else{
286
+                exp.levels <- random.levels
287
+            }
278 288
 
279 289
             curr_sigma.vec <- sapply(exp.levels, FUN=function(lvls, bigZ, errVec){
280 290
                 ijZ <- bigZ[, lvls]
... ...
@@ -291,8 +301,11 @@ fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL,
291 301
             }, bigZ=full.Z, errVec=errMat)
292 302
 
293 303
             # add the residual variance
294
-            res.var <- abs(sig0 - sum(curr_sigma.vec))
295
-            curr_sigma.vec <- c(curr_sigma.vec, res.var)
304
+            if(intercept.type == "random"){
305
+                res.var <- abs(sig0 - sum(curr_sigma.vec))
306
+                curr_sigma.vec <- c(curr_sigma.vec, res.var)
307
+            }
308
+
296 309
             curr_sigma <- Matrix(abs(curr_sigma.vec), ncol=1, sparse=TRUE)
297 310
             # curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), ncol(Z)), ncol=1, sparse=TRUE) # split evenly
298 311
             # curr_sigma <- Matrix(runif(ncol(Z), 0, 1), ncol=1, sparse = TRUE)
... ...
@@ -55,6 +55,11 @@
55 55
 #' these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
56 56
 #' to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
57 57
 #' an error will be produced.
58
+#' @param intercept.type A character scalar, either \emph{fixed} or \emph{random} that sets the type of the global
59
+#' intercept variable in the model. This only applies to the GLMM case where additional random effects variables are
60
+#' already included. Setting \code{intercept.type="fixed"} or \code{intercept.type="random"} will require the user to
61
+#' test their model for failures with each. In the case of using a kinship matrix, \code{intercept.type="fixed"} is
62
+#' set automatically.
58 63
 #' @param fail.on.error A logical scalar the determines the behaviour of the error reporting. Used for debugging only.
59 64
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default
60 65
 #' this will evaluate using \code{SerialParam()}. See \code{details}on how to use parallelisation in \code{testNhoods}.
... ...
@@ -163,10 +168,15 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
163 168
                        min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA", REML=TRUE,
164 169
                        norm.method=c("TMM", "RLE", "logMS"), cell.sizes=NULL,
165 170
                        max.iters = 50, max.tol = 1e-5, glmm.solver=NULL,
166
-                       subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam(),
167
-                       force=FALSE){
171
+                       subset.nhoods=NULL, intercept.type=c("fixed", "random"),
172
+                       fail.on.error=FALSE, BPPARAM=SerialParam(), force=FALSE){
168 173
     is.lmm <- FALSE
169 174
     geno.only <- FALSE
175
+
176
+    if(!any(intercept.type %in% c("fixed", "random"))){
177
+        stop("intercept.type: ", intercept.type, " not recognised, must be either 'fixed' or 'random'")
178
+    }
179
+
170 180
     if(is(design, "formula")){
171 181
         # parse to find random and fixed effects - need to double check the formula is valid
172 182
         parse <- unlist(strsplit(gsub(design, pattern="~", replacement=""), split= "+", fixed=TRUE))
... ...
@@ -185,14 +195,25 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
185 195
             is.lmm <- TRUE
186 196
             if(find_re | is.null(kinship)){
187 197
                 # make model matrices for fixed and random effects
188
-                z.model <- .parse_formula(design, design.df, vtype="re")
198
+                if(length(intercept.type) > 1){
199
+                    intercept.type <- intercept.type[1]
200
+                }
201
+
202
+                if(intercept.type == "random"){
203
+                    rand.int <- TRUE
204
+                } else{
205
+                    rand.int <- FALSE
206
+                }
207
+
208
+                z.model <- .parse_formula(design, design.df, vtype="re", add.int=rand.int)
189 209
                 rownames(z.model) <- rownames(design.df)
190 210
             } else if(find_re & !is.null(kinship)){
191 211
                 if(!all(rownames(kinship) == rownames(design.df))){
192 212
                     stop("Genotype rownames do not match design.df rownames")
193 213
                 }
194 214
 
195
-                z.model <- .parse_formula(design, design.df, vtype="re")
215
+                # genetic model MUST have fixed intercept
216
+                z.model <- .parse_formula(design, design.df, vtype="re", add.int=FALSE)
196 217
                 rownames(z.model) <- rownames(design.df)
197 218
             } else if(!find_re & !is.null(kinship)){
198 219
                 z.model <- diag(nrow(kinship))
... ...
@@ -203,7 +224,13 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
203 224
 
204 225
             # this will always implicitly include a fixed intercept term - perhaps
205 226
             # this shouldn't be the case?
206
-            x.model <- .parse_formula(design, design.df, vtype="fe")
227
+            if(intercept.type == "fixed"){
228
+                fixed.int <- TRUE
229
+            } else{
230
+                fixed.int <- FALSE
231
+            }
232
+
233
+            x.model <- .parse_formula(design, design.df, vtype="fe", add.int=fixed.int)
207 234
             rownames(x.model) <- rownames(design.df)
208 235
             max.iters <- max.iters
209 236
 
... ...
@@ -459,22 +486,24 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
459 486
 
460 487
         #wrapper function is the same for all analyses
461 488
         glmmWrapper <- function(Y, disper, Xmodel, Zmodel, off.sets, randlevels,
462
-                                reml, glmm.contr, genonly=FALSE, kin.ship=NULL,
489
+                                reml, glmm.contr, int.type, genonly=FALSE, kin.ship=NULL,
463 490
                                 BPPARAM=BPPARAM, error.fail=FALSE){
464 491
             #bp.list <- NULL
465 492
             # this needs to be able to run with BiocParallel
466 493
             bp.list <- bptry({bplapply(seq_len(nrow(Y)), BPOPTIONS=bpoptions(stop.on.error = error.fail),
467 494
                                          FUN=function(i, Xmodel, Zmodel, Y, off.sets,
468 495
                                                       randlevels, disper, genonly,
469
-                                                      kin.ship, glmm.contr, reml){
496
+                                                      kin.ship, glmm.contr, reml, int.type){
470 497
                                              fitGLMM(X=Xmodel, Z=Zmodel, y=Y[i, ], offsets=off.sets,
471 498
                                                      random.levels=randlevels, REML = reml,
472 499
                                                      dispersion=disper[i], geno.only=genonly,
473
-                                                     Kin=kinship, glmm.control=glmm.contr)
500
+                                                     Kin=kinship, glmm.control=glmm.contr,
501
+                                                     intercept.type=int.type)
474 502
                                              }, BPPARAM=BPPARAM,
475 503
                                          Xmodel=Xmodel, Zmodel=Zmodel, Y=Y, off.sets=off.sets,
476 504
                                          randlevels=randlevels, disper=disper, genonly=genonly,
477
-                                         kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml)
505
+                                         kin.ship=kin.ship, glmm.cont=glmm.cont, reml=reml,
506
+                                       int.type=intercept.type)
478 507
                                 }) # need to handle this output which is a bplist_error object
479 508
 
480 509
             # parse the bplist_error object
... ...
@@ -512,26 +541,14 @@ testNhoods <- function(x, design, design.df, kinship=NULL,
512 541
             } else{
513 542
                 message("Running genetic model with ", nrow(z.model), " observations")
514 543
             }
515
-
516
-            if(geno.only){
517
-                fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
518
-                                   off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
519
-                                   genonly = geno.only, kin.ship=kinship,
520
-                                   BPPARAM=BPPARAM, error.fail=fail.on.error)
521
-            } else{
522
-                fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
523
-                                   off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
524
-                                   genonly = geno.only, kin.ship=kinship,
525
-                                   BPPARAM=BPPARAM, error.fail=fail.on.error)
526
-            }
527
-
528
-        } else{
529
-            fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
530
-                               off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
531
-                               genonly = geno.only, kin.ship=kinship,
532
-                               BPPARAM=BPPARAM, error.fail=fail.on.error)
533 544
         }
534 545
 
546
+        fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
547
+                           off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
548
+                           genonly = geno.only, kin.ship=kinship,
549
+                           BPPARAM=BPPARAM, error.fail=fail.on.error,
550
+                           int.type=intercept.type)
551
+
535 552
         # give warning about how many neighborhoods didn't converge and error if > 50% nhoods failed
536 553
         n.nhoods <- length(fit)
537 554
         half.n <- floor(n.nhoods * 0.5)
... ...
@@ -66,7 +66,7 @@
66 66
 
67 67
 # parse design formula
68 68
 #' @export
69
-.parse_formula <- function(in.form, design.df, vtype=c("re", "fe")){
69
+.parse_formula <- function(in.form, design.df, vtype=c("re", "fe"), add.int=FALSE){
70 70
     ## parse the formula and return the X and Z matrices
71 71
     # need to decide on how to handle intercept terms - i.e. FE or RE
72 72
     sp.form <- unlist(strsplit(as.character(in.form),
... ...
@@ -83,9 +83,13 @@
83 83
                                           function(x) as.integer(factor(x)))), ncol = length(v.terms))
84 84
         }
85 85
 
86
-        # add the residual variance term
87
-        d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
88
-        colnames(d.mat) <- c(trimws(v.terms), "residual")
86
+        # add the residual variance term if appropriate
87
+        if(isTRUE(add.int)){
88
+            d.mat <- cbind(d.mat, matrix(data=1L, nrow=nrow(d.mat), ncol=1))
89
+            colnames(d.mat) <- c(trimws(v.terms), "residual")
90
+        } else{
91
+            colnames(d.mat) <- trimws(v.terms)
92
+        }
89 93
 
90 94
     } else if(vtype %in% c("fe")){
91 95
         v.terms <- trimws(unlist(sp.form[!grepl(trimws(sp.form), pattern="~|\\|")]))
... ...
@@ -93,8 +97,15 @@
93 97
             v.terms <- paste(v.terms, collapse=" + ")
94 98
         }
95 99
 
100
+        # the intercept is a fixed effect in this model
96 101
         d.mat <- model.matrix(as.formula(paste("~ 1 +", v.terms)), data = design.df)
97
-        d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the intercept term
102
+        d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))] # drop the random terms
103
+
104
+        if(isFALSE(add.int)){
105
+            # drop the intercept term if required.
106
+            d.mat[, !grepl("Intercept", colnames(d.mat)), drop=FALSE]
107
+        }
108
+
98 109
     } else{
99 110
         stop("vtype ", vtype, " not recognised")
100 111
     }
... ...
@@ -167,7 +167,7 @@ set.seed(42)
167 167
 rownames(pbmc.design) <- pbmc.design$ObsID
168 168
 
169 169
 da_results <- testNhoods(pbmc.milo, design = ~ adjmfc.time + (1|tenx_lane), design.df = pbmc.design, fdr.weighting="graph-overlap",
170
-                         glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE)
170
+                         glmm.solver="Fisher", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE, intercept.type="fixed")
171 171
 table(da_results$SpatialFDR < 0.1)
172 172
 ```
173 173
 
... ...
@@ -219,7 +219,7 @@ the results table when using the GLM.
219 219
 We can inspect the distribution of the variance parameter estimates, and compare between the converged vs. not converged nhoods.
220 220
 
221 221
 ```{r}
222
-ggplot(da_results, aes(x=Converged, y=`tenx_lane variance`)) +
222
+ggplot(da_results, aes(x=Converged, y=`tenx_lane_variance`)) +
223 223
     geom_boxplot() + 
224 224
     scale_y_log10() +
225 225
     NULL