Browse code

added function `delta_deviance_lf`

Alejandro Ochoa Garcia authored on 12/05/2021 03:45:51
Showing 8 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: gcatest
2 2
 Title: Genotype Conditional Association TEST
3
-Version: 2.0.2.9000
3
+Version: 2.0.3.9000
4 4
 Author: Wei Hao, Minsun Song, Alejandro Ochoa, John D. Storey
5 5
 Maintainer: Alejandro Ochoa <[email protected]>, Wei Hao <[email protected]>, John D. Storey <[email protected]>
6 6
 Encoding: UTF-8
... ...
@@ -1,5 +1,6 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3
+export(delta_deviance_lf)
3 4
 export(gcat)
4 5
 export(gcat.stat)
5 6
 export(gcatest)
... ...
@@ -33,4 +33,11 @@ Internally there was major code restructuring, and added unit tests for all func
33 33
 
34 34
 - Added internal tests for deviance calculations against `stats::glm`.
35 35
 - Deviance code (internal `delta_deviance_snp`) now returns `NA` instead of stopping when an "impossible" case is encountered (when the genotype `x` is non-zero but the fitted probabilities under either null or alternative model are zero, or the alternative allele dosage (`x-2`) has the same problem).
36
-  These cases are clearly model fitting failures, and can arise for common ill-defined problems, particularly under binary `adjustment` variables passed to `gcat` together with rare variants; these individual cases are not handled any better by `stats:glm`, so it seemed most sensible to return `NA` at such loci and not stop.
36
+  These cases are clearly model fitting failures, and can arise for common ill-defined problems, particularly under binary `adjustment` variables passed to `gcat` together with rare variants; these individual cases are not handled any better by `stats::glm`, so it seemed most sensible to return `NA` at such loci and not stop.
37
+
38
+# 2021-05-11 - gcatest 2.0.3.9000
39
+
40
+- Added function `delta_deviance_lf`, which calculates the delta deviance from two logistic models and the genotype matrix data.
41
+  This function is a more general version of `gcat.stat` (which uses the new function internally), to essentially consider models that differ by more than one degree of freedom.
42
+  It was written in particular for an external application in mind, namely the `jackstraw` package.
43
+- Internal function `assoc_snp` was renamed to `delta_deviance_snp_lf` and its last argument changed to match that of `delta_deviance_lf` (alternative logistic factors instead of trait).
37 44
new file mode 100644
... ...
@@ -0,0 +1,68 @@
1
+#' Calculate delta deviance of logistic fits of a null and alternative model
2
+#'
3
+#' This function fits, at each locus of a given genotype matrix, two logistic models, and under the assumption that the models are nested, calculates the delta deviance between the two.
4
+#' This general function is intended for testing models in a broad setting; for the specific problem of genetic association, the interface in [gcat()] and [gcat.stat()] are more user-friendly.
5
+#'
6
+#' @inheritParams lfa::lfa
7
+#' @param LF0 Logistic factors for null model.
8
+#' @param LF1 Logistic factors for alternative model.
9
+#'
10
+#' @return The vector of delta deviance values, one per locus of `X`.
11
+#'
12
+#' @examples
13
+#' \dontrun{
14
+#' devdiff <- delta_deviance_lf( X, LF0, LF1 )
15
+#' }
16
+#'
17
+#' @export
18
+delta_deviance_lf <- function( X, LF0, LF1 ){
19
+    if ( missing( X ) )
20
+        stop( 'Genotype matrix `X` is required!' )
21
+    if ( missing( LF0 ) )
22
+        stop( "`LF0` matrix is required!" )
23
+    if ( missing( LF1 ) )
24
+        stop( "`LF1` matrix is required!" )
25
+    
26
+    # check class
27
+    is_BEDMatrix <- FALSE
28
+    if ( "BEDMatrix" %in% class(X) ) {
29
+        is_BEDMatrix <- TRUE
30
+    } else if ( !is.matrix( X ) )
31
+        stop( '`X` must be a matrix!' )
32
+
33
+    # get dimensions
34
+    if ( is_BEDMatrix ) {
35
+        m <- ncol(X)
36
+        n <- nrow(X)
37
+    } else {
38
+        n <- ncol(X)
39
+        # m not used in this case
40
+    }
41
+
42
+    # check dimensions
43
+    if ( nrow(LF0) != n )
44
+        stop( 'Number of individuals in `X` must equal number of individuals (rows) in `LF0`' )
45
+    if ( nrow(LF1) != n )
46
+        stop( 'Number of individuals in `X` must equal number of individuals (rows) in `LF1`' )
47
+    
48
+    # check LFs for missing values
49
+    if ( anyNA( LF0 ) )
50
+        stop( '`LF0` must not have missing values!' )
51
+    if ( anyNA( LF1 ) )
52
+        stop( '`LF1` must not have missing values!' )
53
+
54
+    # start processing
55
+    if ( is_BEDMatrix ) {
56
+        # explicit loop for BEDMatrix
57
+        devdiff <- vector('numeric', m)
58
+        for ( i in 1 : m ) {
59
+            # get locus i genotype vector
60
+            xi <- X[ , i ]
61
+            # calculate and store result
62
+            devdiff[ i ] <- delta_deviance_snp_lf( xi, LF0, LF1 )
63
+        }
64
+    } else
65
+        devdiff <- apply( X, 1, delta_deviance_snp_lf, LF0, LF1 )
66
+    return( devdiff )
67
+}
68
+
0 69
similarity index 55%
1 70
rename from R/assoc_snp.R
2 71
rename to R/delta_deviance_snp_lf.R
... ...
@@ -1,35 +1,35 @@
1
-assoc_snp <- function(xi, LF, trait){
1
+delta_deviance_snp_lf <- function( xi, LF0, LF1 ){
2 2
     if ( missing( xi ) )
3 3
         stop( 'Genotype vector `xi` is required!' )
4
-    if ( missing( LF ) )
5
-        stop( "`LF` matrix is required!" )
6
-    if ( missing( trait ) )
7
-        stop( "`trait` is required!" )
4
+    if ( missing( LF0 ) )
5
+        stop( "`LF0` matrix is required!" )
6
+    if ( missing( LF1 ) )
7
+        stop( "`LF1` matrix is required!" )
8 8
 
9 9
     # check dimensions
10 10
     n <- length( xi )
11
-    if ( nrow(LF) != n )
12
-        stop( 'Number of individuals in `xi` must equal number of individuals (rows) in `LF`' )
13
-    if ( length(trait) != n )
14
-        stop( 'Number of individuals in `xi` must equal number of individuals in `trait`' )
11
+    if ( nrow(LF0) != n )
12
+        stop( 'Number of individuals in `xi` must equal number of individuals (rows) in `LF0`' )
13
+    if ( nrow(LF1) != n )
14
+        stop( 'Number of individuals in `xi` must equal number of individuals (rows) in `LF1`' )
15 15
     
16 16
     # check trait and LF for missing values
17
-    if ( anyNA( trait ) )
18
-        stop( '`trait` must not have missing values!' )
19
-    if ( anyNA( LF ) )
20
-        stop( '`LF` must not have missing values!' )
17
+    if ( anyNA( LF0 ) )
18
+        stop( '`LF0` must not have missing values!' )
19
+    if ( anyNA( LF1 ) )
20
+        stop( '`LF1` must not have missing values!' )
21 21
     
22 22
     # remove individuals whose genotypes are NA
23 23
     # though `lfa::af_snp` allows NAs, the resulting imputed allele frequencies do not contribute to the deviance (because the xi's are factors of the sum)!  So best to exclude them from the beginning.
24 24
     indexes_keep <- !is.na(xi)
25 25
     xi <- xi[indexes_keep]
26
-    LF <- LF[indexes_keep,]
27
-    trait <- trait[indexes_keep]
28
-
29
-    # perform two logistic regressions, under the null and alternative (add trait to LF/covariates), resulting in estimated allele frequencies under each model
30
-    p0 <- lfa::af_snp( xi, LF )
31
-    p1 <- lfa::af_snp( xi, cbind(LF, trait) )
32
-
26
+    LF0 <- LF0[indexes_keep,]
27
+    LF1 <- LF1[indexes_keep,]
28
+    
29
+    # perform two logistic regressions, under the null and alternative, resulting in estimated allele frequencies under each model
30
+    p0 <- lfa::af_snp( xi, LF0 )
31
+    p1 <- lfa::af_snp( xi, LF1 )
32
+    
33 33
     # now compute delta deviance
34 34
     # uses a special numerically stable algorithm
35 35
     devdiff <- delta_deviance_snp( xi, p0, p1 )
... ...
@@ -11,21 +11,13 @@ gcat.stat <- function( X, LF, trait, adjustment = NULL ){
11 11
     if ( missing( trait ) )
12 12
         stop( "`trait` is required!" )
13 13
     
14
-    # check class
15
-    is_BEDMatrix <- FALSE
14
+    # check class, get dimensions
16 15
     if ( "BEDMatrix" %in% class(X) ) {
17
-        is_BEDMatrix <- TRUE
18
-    } else if ( !is.matrix( X ) )
19
-        stop( '`X` must be a matrix!' )
20
-
21
-    # get dimensions
22
-    if ( is_BEDMatrix ) {
23
-        m <- ncol(X)
24 16
         n <- nrow(X)
25
-    } else {
17
+    } else if ( is.matrix( X ) ) {
26 18
         n <- ncol(X)
27
-        # m not used in this case
28
-    }
19
+    } else 
20
+        stop( '`X` must be a matrix!' )
29 21
 
30 22
     # check dimensions
31 23
     if ( nrow(LF) != n )
... ...
@@ -48,18 +40,14 @@ gcat.stat <- function( X, LF, trait, adjustment = NULL ){
48 40
         # if all good, add adjustments to LFs
49 41
         LF <- cbind( LF, adjustment )
50 42
     }
43
+
44
+    # bind matrices
45
+    LF1 <- cbind(LF, trait)
46
+
47
+    # this performs the calculations in a broader setting
48
+    # (shared with `jackstraw` package)
49
+    devdiff <- delta_deviance_lf( X, LF, LF1 )
51 50
     
52
-    if ( is_BEDMatrix ) {
53
-        # explicit loop for BEDMatrix
54
-        devdiff <- vector('numeric', m)
55
-        for ( i in 1 : m ) {
56
-            # get locus i genotype vector
57
-            xi <- X[ , i ]
58
-            # calculate and store result
59
-            devdiff[ i ] <- assoc_snp( xi, LF, trait )
60
-        }
61
-    } else
62
-        devdiff <- apply( X, 1, assoc_snp, LF, trait )
63 51
     return( devdiff )
64 52
 }
65 53
 
66 54
new file mode 100644
... ...
@@ -0,0 +1,31 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/delta_deviance_lf.R
3
+\name{delta_deviance_lf}
4
+\alias{delta_deviance_lf}
5
+\title{Calculate delta deviance of logistic fits of a null and alternative model}
6
+\usage{
7
+delta_deviance_lf(X, LF0, LF1)
8
+}
9
+\arguments{
10
+\item{X}{A matrix of SNP genotypes, i.e. an integer matrix of 0's,
11
+1's, 2's and \code{NA}s.
12
+BEDMatrix is supported.
13
+Sparse matrices of class Matrix are not supported (yet).}
14
+
15
+\item{LF0}{Logistic factors for null model.}
16
+
17
+\item{LF1}{Logistic factors for alternative model.}
18
+}
19
+\value{
20
+The vector of delta deviance values, one per locus of \code{X}.
21
+}
22
+\description{
23
+This function fits, at each locus of a given genotype matrix, two logistic models, and under the assumption that the models are nested, calculates the delta deviance between the two.
24
+This general function is intended for testing models in a broad setting; for the specific problem of genetic association, the interface in \code{\link[=gcat]{gcat()}} and \code{\link[=gcat.stat]{gcat.stat()}} are more user-friendly.
25
+}
26
+\examples{
27
+\dontrun{
28
+devdiff <- delta_deviance_lf( X, LF0, LF1 )
29
+}
30
+
31
+}
... ...
@@ -169,11 +169,12 @@ test_that( "delta_deviance_snp works", {
169 169
     expect_true( is.na( delta_deviance_snp( xi, pi, pi_bad2 ) ) )
170 170
 })
171 171
 
172
-test_that("assoc_snp works", {
172
+test_that("delta_deviance_snp_lf works", {
173 173
     # test a single SNP
174 174
     i <- 1
175
+    LF1 <- cbind(LFs, trait)
175 176
     expect_silent(
176
-        devdiff <- assoc_snp( X[ i, ] , LFs, trait )
177
+        devdiff <- delta_deviance_snp_lf( X[ i, ] , LFs, LF1 )
177 178
     )
178 179
     expect_equal( length( devdiff ), 1 )
179 180
     expect_true( is.numeric( devdiff ) )
... ...
@@ -188,6 +189,28 @@ test_that("assoc_snp works", {
188 189
     expect_equal( devdiff, devdiff_glm )
189 190
 })
190 191
 
192
+test_that("delta_deviance_lf works", {
193
+    LF1 <- cbind(LFs, trait)
194
+    expect_silent(
195
+        devdiff <- delta_deviance_lf( X , LFs, LF1 )
196
+    )
197
+    expect_equal( length( devdiff ), m_loci )
198
+    expect_true( is.numeric( devdiff ) )
199
+    # delta deviances can be NA if LFA/glm.fit fail to converge
200
+    expect_true( !any( is.na( devdiff ) ) )
201
+    # theoretically this is true, but in practice it depends on LFA fitting these models well, so testing this is not appropriate here (fails sometimes)
202
+    ## expect_true( all( devdiff >= 0 ) )
203
+    
204
+    # loop through `glm` version
205
+    devdiff_glm <- vector( 'numeric', m_loci )
206
+    for ( i in 1 : m_loci ) {
207
+        suppressWarnings(
208
+            devdiff_glm[i] <- delta_deviance_snp_glm( X[ i, ], LFs, trait )
209
+        )
210
+    }
211
+    expect_equal( devdiff, devdiff_glm )
212
+})
213
+
191 214
 test_that("gcat.stat works", {
192 215
     expect_silent(
193 216
         devdiff <- gcat.stat( X, LFs, trait )
... ...
@@ -253,6 +276,15 @@ if (
253 276
     # load as a BEDMatrix object
254 277
     X_BEDMatrix <- suppressMessages(suppressWarnings( BEDMatrix( file_bed ) ))
255 278
 
279
+    test_that("delta_deviance_lf works with BEDMatrix", {
280
+        LF1 <- cbind( LFs, trait )
281
+        devdiff_basic <- delta_deviance_lf( X, LFs, LF1 )
282
+        expect_silent(
283
+            devdiff_BM <- delta_deviance_lf( X_BEDMatrix, LFs, LF1 )
284
+        )
285
+        expect_equal( devdiff_basic, devdiff_BM )
286
+    })
287
+
256 288
     test_that("gcat.stat works with BEDMatrix", {
257 289
         devdiff_basic <- gcat.stat( X, LFs, trait )
258 290
         expect_silent(