Browse code

preliminary attempt at using just parent data.

mnodzenski authored on 25/10/2021 18:12:57
Showing 1 changed files

... ...
@@ -78,27 +78,83 @@
78 78
 
79 79
 preprocess.genetic.data <- function(case.genetic.data, complement.genetic.data = NULL, father.genetic.data = NULL,
80 80
     mother.genetic.data = NULL, ld.block.vec = NULL, bp.param = bpparam(), snp.sampling.probs = NULL,
81
-    categorical.exposures = NULL, categorical.exposures.risk.ranks = NULL) {
81
+    categorical.exposures = NULL, categorical.exposures.risk.ranks = NULL, parents.only = FALSE) {
82 82
 
83 83
     #make sure the ld.block.vec is correctly specified
84
-    if (is.null(ld.block.vec)){
84
+    if (!parents.only){
85 85
 
86
-        ld.block.vec <- ncol(case.genetic.data)
86
+        if (is.null(ld.block.vec)){
87
+
88
+            ld.block.vec <- ncol(case.genetic.data)
89
+
90
+        } else {
91
+
92
+            if (sum(ld.block.vec) != ncol(case.genetic.data)){
93
+
94
+                stop("sum(ld.block.vec) must be equal to ncol(case.genetic.data)")
95
+
96
+            }
97
+
98
+        }
99
+
100
+        # make sure the appropriate genetic data is included
101
+        if (is.null(complement.genetic.data) & (is.null(father.genetic.data) | is.null(mother.genetic.data))) {
102
+
103
+            stop("Must include complement.genetic.data or both father.genetic.data and mother.genetic.data")
104
+
105
+        }
87 106
 
88 107
     } else {
89 108
 
90
-        if (sum(ld.block.vec) != ncol(case.genetic.data)){
109
+        if ((!is.null(father.genetic.data) | is.null(mother.genetic.data)) |
110
+            (is.null(father.genetic.data) | !is.null(mother.genetic.data))) {
91 111
 
92
-            stop("sum(ld.block.vec) must be equal to ncol(case.genetic.data)")
112
+            stop("Must include both father.genetic.data and mother.genetic.data")
93 113
 
94 114
         }
95 115
 
96
-    }
116
+        if (!is.null(case.genetic.data) & is.null(complement.genetic.data)) {
117
+
118
+            stop("If case.genetic.data is specified, must also include sibling data in argument complement.genetic.data")
119
+
120
+        }
121
+        if (is.null(categorical.exposures)){
122
+
123
+            stop("categorical.exposures must be specific for parents.only = TRUE")
124
+
125
+        }
126
+
127
+        if (!is.null(father.genetic.data) & !is.null(case.genetic.data)){
128
+
129
+            study.type <- "mix"
130
+            n.total <- nrow(father.genetic.data) + nrow(case.genetic.data)
131
+            if (length(exposure) != n.total){
132
+
133
+                stop("length(exposure) must be equal to nrow(father.genetic.data) + nrow(case.genetic.data)")
134
+
135
+            }
136
+
137
+        } else if (!is.null(father.genetic.data) & is.null(case.genetic.data)){
138
+
139
+            study.type <- "triad"
140
+            n.total <-  nrow(father.genetic.data)
141
+            if (length(exposure) != n.total){
142
+
143
+                stop("length(exposure) must be equal to nrow(father.genetic.data)")
144
+
145
+            }
146
+
147
+        } else if (is.null(father.genetic.data) & !is.null(case.genetic.data)) {
97 148
 
98
-    # make sure the appropriate genetic data is included
99
-    if (is.null(complement.genetic.data) & (is.null(father.genetic.data) | is.null(mother.genetic.data))) {
149
+            study.type <- "sibling"
150
+            n.total <-  nrow(case.genetic.data)
151
+            if (length(exposure) != n.total){
100 152
 
101
-        stop("Must include complement.genetic.data or both father.genetic.data and mother.genetic.data")
153
+                stop("length(exposure) must be equal to nrow(case.genetic.data)")
154
+
155
+            }
156
+
157
+        }
102 158
 
103 159
     }
104 160
 
... ...
@@ -167,298 +223,534 @@ preprocess.genetic.data <- function(case.genetic.data, complement.genetic.data =
167 223
 
168 224
     }
169 225
 
170
-    # check formatting of input data and, if necessary, create memory mapped files
171
-    if (!any(class(case.genetic.data) %in% c("matrix", "big.matrix"))){
226
+    if (!parents.only){
172 227
 
173
-        stop("case.genetic.data must be of class matrix or big.matrix")
228
+        # check formatting of input data and, if necessary, create memory mapped files
229
+        if (!any(class(case.genetic.data) %in% c("matrix", "big.matrix"))){
174 230
 
175
-    }
231
+            stop("case.genetic.data must be of class matrix or big.matrix")
176 232
 
177
-    if (any(class(case.genetic.data) == "matrix")){
233
+        }
178 234
 
179
-        if (!all(round(case.genetic.data) == case.genetic.data, na.rm = TRUE)){
235
+        if (any(class(case.genetic.data) == "matrix")){
180 236
 
181
-            stop("case.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
237
+            if (!all(round(case.genetic.data) == case.genetic.data, na.rm = TRUE)){
182 238
 
183
-        }
239
+                stop("case.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
240
+
241
+            }
242
+
243
+            storage.mode(case.genetic.data) <- "integer"
244
+
245
+            # convert to big.matrix
246
+            dimnames(case.genetic.data) <- NULL
247
+            case.bm <- as.big.matrix(case.genetic.data, type = "integer")
248
+            rm(case.genetic.data)
249
+
250
+        } else if (class(case.genetic.data) == "big.matrix"){
184 251
 
185
-        storage.mode(case.genetic.data) <- "integer"
252
+            if (! describe(case.genetic.data)@description$type %in% c("integer")){
186 253
 
187
-        # convert to big.matrix
188
-        dimnames(case.genetic.data) <- NULL
189
-        case.bm <- as.big.matrix(case.genetic.data, type = "integer")
190
-        rm(case.genetic.data)
254
+                stop("case.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
191 255
 
192
-    } else if (class(case.genetic.data) == "big.matrix"){
256
+            }
257
+
258
+            if (describe(case.genetic.data)@description$sharedType != "FileBacked"){
259
+
260
+                stop("case.genetic.data must be a file backed big.matrix (case.genetic.data@description$sharedType == 'FileBacked')")
193 261
 
194
-        if (! describe(case.genetic.data)@description$type %in% c("integer")){
262
+            }
195 263
 
196
-            stop("case.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
264
+            case.bm <- case.genetic.data
197 265
 
198 266
         }
199 267
 
200
-        if (describe(case.genetic.data)@description$sharedType != "FileBacked"){
268
+        if (!is.null(complement.genetic.data) & !any(class(complement.genetic.data) %in% c("matrix", "big.matrix"))){
201 269
 
202
-            stop("case.genetic.data must be a file backed big.matrix (case.genetic.data@description$sharedType == 'FileBacked')")
270
+            stop("complement.genetic.data must be of class matrix or big.matrix")
203 271
 
204 272
         }
205 273
 
206
-        case.bm <- case.genetic.data
274
+        if (!is.null(complement.genetic.data) & any(class(complement.genetic.data) == "matrix")){
207 275
 
208
-    }
276
+            if (!all(round(complement.genetic.data) == complement.genetic.data, na.rm = TRUE)){
209 277
 
210
-    if (!is.null(complement.genetic.data) & !any(class(complement.genetic.data) %in% c("matrix", "big.matrix"))){
278
+                stop("complement.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
211 279
 
212
-        stop("complement.genetic.data must be of class matrix or big.matrix")
280
+            }
213 281
 
214
-    }
282
+            storage.mode(complement.genetic.data) <- "integer"
215 283
 
216
-    if (!is.null(complement.genetic.data) & any(class(complement.genetic.data) == "matrix")){
284
+            # convert to big.matrix
285
+            dimnames(complement.genetic.data) <- NULL
286
+            comp.bm <- as.big.matrix(complement.genetic.data, type = "integer")
287
+            rm(complement.genetic.data)
217 288
 
218
-        if (!all(round(complement.genetic.data) == complement.genetic.data, na.rm = TRUE)){
289
+        } else if (!is.null(complement.genetic.data) & class(complement.genetic.data) == "big.matrix"){
219 290
 
220
-            stop("complement.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
291
+            if (! describe(complement.genetic.data)@description$type %in% c("integer")){
221 292
 
222
-        }
293
+                stop("complement.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
223 294
 
224
-        storage.mode(complement.genetic.data) <- "integer"
295
+            }
225 296
 
226
-        # convert to big.matrix
227
-        dimnames(complement.genetic.data) <- NULL
228
-        comp.bm <- as.big.matrix(complement.genetic.data, type = "integer")
229
-        rm(complement.genetic.data)
297
+            if (describe(complement.genetic.data)@description$sharedType != "FileBacked"){
230 298
 
231
-    } else if (!is.null(complement.genetic.data) & class(complement.genetic.data) == "big.matrix"){
299
+                stop("complement.genetic.data must be a file backed big.matrix (complement.genetic.data@description$sharedType == 'FileBacked')")
232 300
 
233
-        if (! describe(complement.genetic.data)@description$type %in% c("integer")){
301
+            }
234 302
 
235
-            stop("complement.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
303
+            comp.bm <- complement.genetic.data
236 304
 
237 305
         }
238 306
 
239
-        if (describe(complement.genetic.data)@description$sharedType != "FileBacked"){
307
+        if (!is.null(mother.genetic.data) & !any(class(mother.genetic.data) %in% c("matrix", "big.matrix"))){
240 308
 
241
-            stop("complement.genetic.data must be a file backed big.matrix (complement.genetic.data@description$sharedType == 'FileBacked')")
309
+            stop("mother.genetic.data must be of class matrix or big.matrix")
242 310
 
243 311
         }
244 312
 
245
-        comp.bm <- complement.genetic.data
313
+        if (!is.null(mother.genetic.data) & any(class(mother.genetic.data) == "matrix")){
246 314
 
247
-    }
315
+            if (!all(round(mother.genetic.data) == mother.genetic.data, na.rm = TRUE)){
248 316
 
249
-    if (!is.null(mother.genetic.data) & !any(class(mother.genetic.data) %in% c("matrix", "big.matrix"))){
317
+                stop("mother.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
250 318
 
251
-        stop("mother.genetic.data must be of class matrix or big.matrix")
319
+            }
252 320
 
253
-    }
321
+            storage.mode(mother.genetic.data) <- "integer"
254 322
 
255
-    if (!is.null(mother.genetic.data) & any(class(mother.genetic.data) == "matrix")){
323
+            # convert to big.matrix
324
+            dimnames(mother.genetic.data) <- NULL
325
+            mother.bm <- as.big.matrix(mother.genetic.data, type = "integer")
326
+            rm(mother.genetic.data)
256 327
 
257
-        if (!all(round(mother.genetic.data) == mother.genetic.data, na.rm = TRUE)){
328
+        } else if (!is.null(mother.genetic.data) & class(mother.genetic.data) == "big.matrix"){
258 329
 
259
-            stop("mother.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
330
+            if (! describe(mother.genetic.data)@description$type %in% c("integer")){
331
+
332
+                stop("mother.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
333
+
334
+            }
335
+
336
+            if (describe(mother.genetic.data)@description$sharedType != "FileBacked"){
337
+
338
+                stop("mother.genetic.data must be a file backed big.matrix (mother.genetic.data@description$sharedType == 'FileBacked')")
339
+
340
+            }
341
+
342
+            mother.bm <- mother.genetic.data
343
+
344
+        }
345
+
346
+        if (!is.null(father.genetic.data) & !any(class(father.genetic.data) %in% c("matrix", "big.matrix"))){
347
+
348
+            stop("father.genetic.data must be of class matrix or big.matrix")
260 349
 
261 350
         }
262 351
 
263
-        storage.mode(mother.genetic.data) <- "integer"
352
+        if (!is.null(father.genetic.data) & any(class(father.genetic.data) == "matrix")){
264 353
 
265
-        # convert to big.matrix
266
-        dimnames(mother.genetic.data) <- NULL
267
-        mother.bm <- as.big.matrix(mother.genetic.data, type = "integer")
268
-        rm(mother.genetic.data)
354
+            if (!all(round(father.genetic.data) == father.genetic.data, na.rm = TRUE)){
269 355
 
270
-    } else if (!is.null(mother.genetic.data) & class(mother.genetic.data) == "big.matrix"){
356
+                stop("father.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
271 357
 
272
-        if (! describe(mother.genetic.data)@description$type %in% c("integer")){
358
+            }
359
+            storage.mode(father.genetic.data) <- "integer"
360
+
361
+            # convert to big.matrix
362
+            dimnames(father.genetic.data) <- NULL
363
+            father.bm <- as.big.matrix(father.genetic.data, type = "integer")
364
+            rm(father.genetic.data)
365
+
366
+        } else if (!is.null(father.genetic.data) & class(father.genetic.data) == "big.matrix"){
273 367
 
274
-            stop("mother.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
368
+            if (! describe(father.genetic.data)@description$type %in% c("integer")){
369
+
370
+                stop("father.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
371
+
372
+            }
373
+
374
+            if (describe(father.genetic.data)@description$sharedType != "FileBacked"){
375
+
376
+                stop("father.genetic.data must be a file backed big.matrix (father.genetic.data@description$sharedType == 'FileBacked')")
377
+
378
+            }
379
+
380
+            father.bm <- father.genetic.data
275 381
 
276 382
         }
277 383
 
278
-        if (describe(mother.genetic.data)@description$sharedType != "FileBacked"){
384
+        # make a list of big matrix objects
385
+        if (exists("comp.bm")){
386
+
387
+            bm.list <- list(case = case.bm, complement = comp.bm)
279 388
 
280
-            stop("mother.genetic.data must be a file backed big.matrix (mother.genetic.data@description$sharedType == 'FileBacked')")
389
+        } else {
390
+
391
+            bm.list <- list(case = case.bm, mother = mother.bm, father = father.bm)
281 392
 
282 393
         }
283 394
 
284
-        mother.bm <- mother.genetic.data
395
+        # if needed compute sampling probs
396
+        if (is.null(snp.sampling.probs)){
285 397
 
286
-    }
398
+            ### use conditional logistic regression to estimate univariate association ###
399
+            n.fam <- nrow(case.bm)
400
+            n.candidate.snps <- ncol(case.bm)
401
+            case.status <- c(rep(1, n.fam), rep(0, n.fam))
402
+            ids <- rep(seq_len(n.fam), 2)
287 403
 
288
-    if (!is.null(father.genetic.data) & !any(class(father.genetic.data) %in% c("matrix", "big.matrix"))){
404
+            if (is.null(categorical.exposures)){
289 405
 
290
-        stop("father.genetic.data must be of class matrix or big.matrix")
406
+                res.list <- bplapply(seq_len(n.candidate.snps), function(snp, bm.list) {
291 407
 
292
-    }
408
+                    case.snp <- bm.list$case[ , snp]
409
+                    if (length(bm.list) == 3){
410
+
411
+                        mom.snp <- bm.list$mother[ , snp]
412
+                        dad.snp <- bm.list$father[ , snp]
413
+                        comp.snp <- mom.snp + dad.snp - case.snp
414
+
415
+                    } else {
416
+
417
+                        comp.snp <- bm.list$complement[ , snp]
418
+
419
+                    }
420
+
421
+                    # get p-value of association from conditional logistic regression
422
+                    case.comp.geno <- c(case.snp, comp.snp)
423
+                    clogit.res <- clogit(case.status ~ case.comp.geno + strata(ids), method = "approximate")
424
+                    clogit.chisq <- summary(clogit.res)$logtest[1]
425
+
426
+                    return(list(case.snp = case.snp, comp.snp = comp.snp, chisq = clogit.chisq))
427
+
428
+                }, bm.list = bm.list, BPPARAM = bp.param)
429
+                chisq.stats <- do.call("c", lapply(res.list, function(x) x$chisq))
430
+
431
+            } else {
432
+
433
+                exposure.var <- factor(rep(exposure, 2))
434
+                res.list <- bplapply(seq_len(n.candidate.snps), function(snp, bm.list, exposure.var) {
435
+
436
+                    case.snp <- bm.list$case[ , snp]
437
+                    if (length(bm.list) == 3){
438
+
439
+                        mom.snp <- bm.list$mother[ , snp]
440
+                        dad.snp <- bm.list$father[ , snp]
441
+                        comp.snp <- mom.snp + dad.snp - case.snp
442
+
443
+                    } else {
444
+
445
+                        comp.snp <- bm.list$complement[ , snp]
446
+
447
+                    }
293 448
 
294
-    if (!is.null(father.genetic.data) & any(class(father.genetic.data) == "matrix")){
449
+                    # get p-value of snp-exposure association from conditional logistic regression
450
+                    case.comp.geno <- c(case.snp, comp.snp)
451
+                    df <- data.table(case.status = case.status, case.comp.geno = case.comp.geno, exposure = exposure.var, ids = ids)
452
+                    full.model <- clogit(case.status ~ case.comp.geno + case.comp.geno:exposure + strata(ids), method = "approximate", data  = df)
453
+                    full.model.ll <- full.model$loglik[2]
454
+                    reduced.model <- clogit(case.status ~ case.comp.geno + strata(ids), method = "approximate", data  = df)
455
+                    reduced.model.ll <- reduced.model$loglik[2]
456
+                    clogit.chisq <- 2*(full.model.ll - reduced.model.ll)
295 457
 
296
-        if (!all(round(father.genetic.data) == father.genetic.data, na.rm = TRUE)){
458
+                    return(list(case.snp = case.snp, comp.snp = comp.snp, chisq = clogit.chisq))
297 459
 
298
-            stop("father.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
460
+                }, bm.list = bm.list, exposure.var = exposure.var, BPPARAM = bp.param)
461
+                chisq.stats <- do.call("c", lapply(res.list, function(x) x$chisq))
462
+
463
+            }
299 464
 
300 465
         }
301
-        storage.mode(father.genetic.data) <- "integer"
302 466
 
303
-        # convert to big.matrix
304
-        dimnames(father.genetic.data) <- NULL
305
-        father.bm <- as.big.matrix(father.genetic.data, type = "integer")
306
-        rm(father.genetic.data)
467
+        # take cumulative sum of ld.block.vec for output
468
+        out.ld.vec <- cumsum(ld.block.vec)
469
+        storage.mode(out.ld.vec) <- "integer"
470
+
471
+        #### clean up chisq stats for models that did not converge ###
472
+        chisq.stats[chisq.stats <= 0] <- 10^-10
473
+        chisq.stats[is.infinite(chisq.stats)] <- max(chisq.stats[is.finite(chisq.stats)])
474
+
475
+        ### if running GxE create required inputs ###
476
+        if (!is.null(exposure)){
307 477
 
308
-    } else if (!is.null(father.genetic.data) & class(father.genetic.data) == "big.matrix"){
478
+            if (is.null(exposure.risk.levels)){
309 479
 
310
-        if (! describe(father.genetic.data)@description$type %in% c("integer")){
480
+                exposure.risk.levels <- rep(1, length(unique(exposure)))
311 481
 
312
-            stop("father.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
482
+            } else {
483
+
484
+                exposure.risk.levels <- unlist(exposure.risk.levels[as.character(unique(exposure))])
485
+
486
+            }
487
+
488
+            exposure.levels <- unique(exposure)
489
+            storage.mode(exposure.levels) <- "integer"
490
+            storage.mode(exposure.risk.levels) <- "integer"
491
+
492
+        } else {
493
+
494
+            exposure.levels <- NULL
495
+            exposure.risk.levels <- NULL
313 496
 
314 497
         }
315 498
 
316
-        if (describe(father.genetic.data)@description$sharedType != "FileBacked"){
499
+        if (!"complement" %in% names(bm.list)){
500
+
501
+            comp.data <- mother.bm[] + father.bm[] - case.bm[]
317 502
 
318
-            stop("father.genetic.data must be a file backed big.matrix (father.genetic.data@description$sharedType == 'FileBacked')")
503
+        } else {
504
+
505
+            comp.data <- comp.bm[]
319 506
 
320 507
         }
321 508
 
322
-        father.bm <- father.genetic.data
509
+        case.data <- case.bm[]
323 510
 
324
-    }
511
+        # set missing to -9
512
+        if (any(is.na(case.data)) | any(is.na(comp.data))){
513
+
514
+            case.data[is.na(case.data) | is.na(comp.data)] <- -9
515
+            comp.data[is.na(case.data) | is.na(comp.data)] <- -9
325 516
 
326
-    # make a list of big matrix objects
327
-    if (exists("comp.bm")){
517
+        }
328 518
 
329
-        bm.list <- list(case = case.bm, complement = comp.bm)
519
+        return(list(case.genetic.data = case.data, complement.genetic.data = comp.data, chisq.stats = chisq.stats, ld.block.vec = out.ld.vec,
520
+                    exposure = exposure, exposure.levels = exposure.levels, exposure.risk.levels = exposure.risk.levels))
330 521
 
331 522
     } else {
332 523
 
333
-        bm.list <- list(case = case.bm, mother = mother.bm, father = father.bm)
524
+        # check formatting of input data and, if necessary, create memory mapped files
525
+        if (study.type %in% c("triad", "mix")){
334 526
 
335
-    }
527
+            #first check father
528
+            if (!any(class(father.genetic.data) %in% c("matrix", "big.matrix"))){
529
+
530
+                stop("father.genetic.data must be of class matrix or big.matrix")
336 531
 
337
-    # if needed compute sampling probs
338
-    if (is.null(snp.sampling.probs)){
532
+            }
339 533
 
340
-        ### use conditional logistic regression to estimate univariate association ###
341
-        n.fam <- nrow(case.bm)
342
-        n.candidate.snps <- ncol(case.bm)
343
-        case.status <- c(rep(1, n.fam), rep(0, n.fam))
344
-        ids <- rep(seq_len(n.fam), 2)
534
+            if (any(class(father.genetic.data) == "matrix")){
345 535
 
346
-        if (is.null(categorical.exposures)){
536
+                if (!all(round(father.genetic.data) == father.genetic.data, na.rm = TRUE)){
537
+
538
+                    stop("father.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
539
+
540
+                }
541
+
542
+                storage.mode(father.genetic.data) <- "integer"
347 543
 
348
-            res.list <- bplapply(seq_len(n.candidate.snps), function(snp, bm.list) {
544
+                # convert to big.matrix
545
+                dimnames(father.genetic.data) <- NULL
349 546
 
350
-                case.snp <- bm.list$case[ , snp]
351
-                if (length(bm.list) == 3){
547
+            } else if (class(father.genetic.data) == "big.matrix"){
352 548
 
353
-                    mom.snp <- bm.list$mother[ , snp]
354
-                    dad.snp <- bm.list$father[ , snp]
355
-                    comp.snp <- mom.snp + dad.snp - case.snp
549
+                if (! describe(father.genetic.data)@description$type %in% c("integer")){
356 550
 
357
-                } else {
551
+                    stop("father.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
358 552
 
359
-                    comp.snp <- bm.list$complement[ , snp]
553
+                }
554
+
555
+                if (describe(father.genetic.data)@description$sharedType != "FileBacked"){
556
+
557
+                    stop("father.genetic.data must be a file backed big.matrix (father.genetic.data@description$sharedType == 'FileBacked')")
360 558
 
361 559
                 }
362 560
 
363
-                # get p-value of association from conditional logistic regression
364
-                case.comp.geno <- c(case.snp, comp.snp)
365
-                clogit.res <- clogit(case.status ~ case.comp.geno + strata(ids), method = "approximate")
366
-                clogit.chisq <- summary(clogit.res)$logtest[1]
561
+                # convert to regular matrix
562
+                father.genetic.data <- father.genetic.data[]
367 563
 
368
-                return(list(case.snp = case.snp, comp.snp = comp.snp, chisq = clogit.chisq))
564
+            }
369 565
 
370
-            }, bm.list = bm.list, BPPARAM = bp.param)
371
-            chisq.stats <- do.call("c", lapply(res.list, function(x) x$chisq))
566
+            # then mother
567
+            if (!any(class(mother.genetic.data) %in% c("matrix", "big.matrix"))){
372 568
 
373
-        } else {
569
+                stop("mother.genetic.data must be of class matrix or big.matrix")
374 570
 
375
-            exposure.var <- factor(rep(exposure, 2))
376
-            res.list <- bplapply(seq_len(n.candidate.snps), function(snp, bm.list, exposure.var) {
571
+            }
377 572
 
378
-                case.snp <- bm.list$case[ , snp]
379
-                if (length(bm.list) == 3){
573
+            if (any(class(mother.genetic.data) == "matrix")){
380 574
 
381
-                    mom.snp <- bm.list$mother[ , snp]
382
-                    dad.snp <- bm.list$father[ , snp]
383
-                    comp.snp <- mom.snp + dad.snp - case.snp
575
+                if (!all(round(mother.genetic.data) == mother.genetic.data, na.rm = TRUE)){
384 576
 
385
-                } else {
577
+                    stop("mother.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
386 578
 
387
-                    comp.snp <- bm.list$complement[ , snp]
579
+                }
580
+
581
+                storage.mode(mother.genetic.data) <- "integer"
582
+
583
+                # convert to big.matrix
584
+                dimnames(mother.genetic.data) <- NULL
585
+
586
+            } else if (class(mother.genetic.data) == "big.matrix"){
587
+
588
+                if (! describe(mother.genetic.data)@description$type %in% c("integer")){
589
+
590
+                    stop("mother.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
388 591
 
389 592
                 }
390 593
 
391
-                # get p-value of snp-exposure association from conditional logistic regression
392
-                case.comp.geno <- c(case.snp, comp.snp)
393
-                df <- data.table(case.status = case.status, case.comp.geno = case.comp.geno, exposure = exposure.var, ids = ids)
394
-                full.model <- clogit(case.status ~ case.comp.geno + case.comp.geno:exposure + strata(ids), method = "approximate", data  = df)
395
-                full.model.ll <- full.model$loglik[2]
396
-                reduced.model <- clogit(case.status ~ case.comp.geno + strata(ids), method = "approximate", data  = df)
397
-                reduced.model.ll <- reduced.model$loglik[2]
398
-                clogit.chisq <- 2*(full.model.ll - reduced.model.ll)
594
+                if (describe(mother.genetic.data)@description$sharedType != "FileBacked"){
399 595
 
400
-                return(list(case.snp = case.snp, comp.snp = comp.snp, chisq = clogit.chisq))
596
+                    stop("mother.genetic.data must be a file backed big.matrix (mother.genetic.data@description$sharedType == 'FileBacked')")
401 597
 
402
-            }, bm.list = bm.list, exposure.var = exposure.var, BPPARAM = bp.param)
403
-            chisq.stats <- do.call("c", lapply(res.list, function(x) x$chisq))
598
+                }
599
+
600
+                # convert to regular matrix
601
+                mother.genetic.data <- mother.genetic.data[]
602
+            }
603
+
604
+            # make informativeness matrix
605
+            mom.dad.info.mat <- (mother.genetic.data == 1) + (father.genetic.data == 1)
606
+            rm(mother.genetic.data)
607
+            rm(father.genetic.data)
404 608
 
405 609
         }
406 610
 
407
-    }
611
+        # check formatting of input data and, if necessary, create memory mapped files
612
+        if (study.type %in% c("sibling", "mix")){
613
+
614
+            #first check case
615
+            if (!any(class(case.genetic.data) %in% c("matrix", "big.matrix"))){
408 616
 
409
-    # take cumulative sum of ld.block.vec for output
410
-    out.ld.vec <- cumsum(ld.block.vec)
411
-    storage.mode(out.ld.vec) <- "integer"
617
+                stop("case.genetic.data must be of class matrix or big.matrix")
412 618
 
413
-    #### clean up chisq stats for models that did not converge ###
414
-    chisq.stats[chisq.stats <= 0] <- 10^-10
415
-    chisq.stats[is.infinite(chisq.stats)] <- max(chisq.stats[is.finite(chisq.stats)])
619
+            }
416 620
 
417
-    ### if running GxE create required inputs ###
418
-    if (!is.null(exposure)){
621
+            if (any(class(case.genetic.data) == "matrix")){
419 622
 
420
-        if (is.null(exposure.risk.levels)){
623
+                if (!all(round(case.genetic.data) == case.genetic.data, na.rm = TRUE)){
421 624
 
422
-            exposure.risk.levels <- rep(1, length(unique(exposure)))
625
+                    stop("case.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
423 626
 
424
-        } else {
627
+                }
628
+
629
+                storage.mode(father.genetic.data) <- "integer"
630
+                dimnames(father.genetic.data) <- NULL
631
+
632
+            } else if (class(case.genetic.data) == "big.matrix"){
633
+
634
+                if (! describe(father.genetic.data)@description$type %in% c("integer")){
425 635
 
426
-            exposure.risk.levels <- unlist(exposure.risk.levels[as.character(unique(exposure))])
636
+                    stop("case.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
637
+
638
+                }
639
+
640
+                if (describe(case.genetic.data)@description$sharedType != "FileBacked"){
641
+
642
+                    stop("case.genetic.data must be a file backed big.matrix (case.genetic.data@description$sharedType == 'FileBacked')")
643
+
644
+                }
645
+
646
+                # convert to regular matrix
647
+                case.genetic.data <- case.genetic.data[]
648
+
649
+            }
650
+
651
+            # then complement
652
+            if (!any(class(complement.genetic.data) %in% c("matrix", "big.matrix"))){
653
+
654
+                stop("complement.genetic.data must be of class matrix or big.matrix")
655
+
656
+            }
657
+
658
+            if (any(class(complement.genetic.data) == "matrix")){
659
+
660
+                if (!all(round(complement.genetic.data) = complement.genetic.data, na.rm = TRUE)){
661
+
662
+                    stop("complement.genetic.data genotypes must be integers, not dosages imputed with uncertainty")
663
+
664
+                }
665
+
666
+                storage.mode(complement.genetic.data) <- "integer"
667
+
668
+                # convert to big.matrix
669
+                dimnames(complement.genetic.data) <- NULL
670
+
671
+            } else if (class(complement.genetic.data) == "big.matrix"){
672
+
673
+                if (! describe(complement.genetic.data)@description$type %in% c("integer")){
674
+
675
+                    stop("complement.genetic.data must be a big.matrix of type integer. To convert, see function deepcopy from package bigmemory.")
676
+
677
+                }
678
+
679
+                if (describe(complement.genetic.data)@description$sharedType != "FileBacked"){
680
+
681
+                    stop("complement.genetic.data must be a file backed big.matrix (complement.genetic.data@description$sharedType == 'FileBacked')")
682
+
683
+                }
684
+
685
+                # convert to regular matrix
686
+                complement.genetic.data <- complement.genetic.data[]
687
+
688
+            }
689
+
690
+            case.minus.comp <- abs(case.genetic.data - complement.genetic.data
691
+            case.comp.info.mat <- 2*(case.minus.comp == 2) + (case.minus.comp == 1)
692
+            rm(case.minus.comp)
693
+            rm(case.genetic.data)
694
+            rm(complement.genetic.data)
427 695
 
428 696
         }
429 697
 
430
-        exposure.levels <- unique(exposure)
431
-        storage.mode(exposure.levels) <- "integer"
432
-        storage.mode(exposure.risk.levels) <- "integer"
698
+        if (study.type == "mix"){
433 699
 
434
-    } else {
700
+            info.mat <- rbind(mom.dad.info.mat, case.comp.info.mat)
701
+            rm(mom.dad.info.mat)
702
+            rm(case.comp.info.mat)
435 703
 
436
-        exposure.levels <- NULL
437
-        exposure.risk.levels <- NULL
704
+        } else if (study.type == "triad"){
438 705
 
439
-    }
706
+            info.mat <- mom.dad.info.mat
707
+            rm(mom.dad.info.mat)
440 708
 
441
-    if (!"complement" %in% names(bm.list)){
709
+        } else if (study.type == "sibling"){
442 710
 
443
-        comp.data <- mother.bm[] + father.bm[] - case.bm[]
711
+            info.mat <- case.comp.info.mat
712
+            rm(case.comp.info.mat)
444 713
 
445
-    } else {
714
+        }
446 715
 
447
-        comp.data <- comp.bm[]
716
+        #convert to big matrix
717
+        info.bm <- as.big.matrix(info.mat, type = "integer")
718
+        rm(info.mat)
448 719
 
449
-    }
720
+        # look at informatiness/exposure association
721
+        if (is.null(snp.sampling.probs)){
450 722
 
451
-    case.data <- case.bm[]
723
+            n.candidate.snps <- ncol(info.bm)
452 724
 
453
-    # set missing to -9
454
-    if (any(is.na(case.data)) | any(is.na(comp.data))){
725
+            exposure.var <- factor(exposure)
726
+            fstats <- unlist(bplapply(seq_len(n.candidate.snps), function(snp, info.bm, exposure.var) {
455 727
 
456
-        case.data[is.na(case.data) | is.na(comp.data)] <- -9
457
-        comp.data[is.na(case.data) | is.na(comp.data)] <- -9
728
+                informativeness <- info.bm[ , snp]
458 729
 
459
-    }
730
+                # get p-value of snp-exposure association from conditional logistic regression
731
+                res <- lm(snp ~ exposure.var)
732
+                fstat <- summary(res)$fstatistic[1]
733
+                return(fstat)
734
+
735
+            }, info.bm = info.bm, exposure.var = exposure.var, BPPARAM = bp.param)
736
+
737
+        }
460 738
 
461
-    return(list(case.genetic.data = case.data, complement.genetic.data = comp.data, chisq.stats = chisq.stats, ld.block.vec = out.ld.vec,
462
-        exposure = exposure, exposure.levels = exposure.levels, exposure.risk.levels = exposure.risk.levels))
739
+        #### clean up fstats for models that did not converge ###
740
+        fstats[fstats <= 0] <- 10^-10
741
+        fstats[is.infinite(fstats)] <- max(fstats[is.finite(fstats)])
742
+        info.mat <- info.bm[]
743
+
744
+        # set missing to zero (not informative)
745
+        if (any(is.na(info.mat))){
746
+
747
+            info.mat[is.na(info.mat)] <- 0
748
+        }
749
+
750
+        return(list(case.genetic.data = NULL, complement.genetic.data = NULL, chisq.stats = fstats, ld.block.vec = NULL,
751
+                    exposure = exposure, exposure.levels = exposure.levels, exposure.risk.levels = exposure.risk.levels,
752
+                    info.mat = info.mat))
753
+
754
+    }
463 755
 
464 756
 }