Browse code

testing diff tests

Chen Zuo authored on 09/11/2014 18:39:53
Showing 3 changed files

... ...
@@ -1,6 +1,10 @@
1 1
 # Generated by roxygen2 (4.0.2): do not edit by hand
2 2
 
3 3
 export(ComputeMotifScore)
4
+export(ComputePValues)
4 5
 export(LoadMotifLibrary)
5 6
 export(LoadSNPData)
7
+import(Rcpp)
8
+import(data.table)
9
+import(doMC)
6 10
 useDynLib(MotifAnalysis)
7 11
new file mode 100644
... ...
@@ -0,0 +1,253 @@
1
+library(MotifAnalysis)
2
+library(testthat)
3
+data(example)
4
+
5
+trans_mat <- matrix(rep(snpInfo$prior, each = 4), nrow = 4)
6
+test_pwm <- motif_library$matrix[[1]]
7
+scores <- as.matrix(motif_scores[motif == names(motif_library$matrix)[1], list(log_lik_ref, log_lik_snp)])
8
+
9
+test_score <- test_pwm
10
+for(i in seq(nrow(test_score))) {
11
+  for(j in seq(ncol(test_score))) {
12
+    test_score[i, j] <- exp(mean(log((test_pwm[i, -j]) / test_pwm[i, j])))
13
+  }
14
+}
15
+
16
+## these are functions for this test only
17
+drawonesample <- function(theta) {
18
+  delta <- snpInfo$prior * t(test_score) ^ theta
19
+  prob_start <- rev(apply(delta, 2, sum))
20
+  sample <- sample(1:4, 19, replace = TRUE, prob = snpInfo$prior)
21
+  id <- sample(seq(10), 1, prob = prob_start)
22
+  sample[10] <- sample(1:4, 1, prob = delta[, 11 - id])
23
+  sample <- c(sample,
24
+              id,
25
+              test_score[11 - id, sample[10]]
26
+              )
27
+  return(sample)
28
+}
29
+jointprob <- function(x) prod(test_pwm[cbind(seq(10), x)])
30
+maxjointprob <- function(x) {
31
+  maxp <- -Inf
32
+  p <- -Inf
33
+  for(i in 1:10) {
34
+    p <- jointprob(x[i:(i+9)])
35
+    if(p > maxp)
36
+      maxp <- p
37
+  }
38
+  for(i in 1:10) {
39
+    p <- jointprob(5 - x[(i+9):i])
40
+    if(p > maxp)
41
+      maxp <- p
42
+  }
43
+  return(maxp)
44
+}
45
+get_freq <- function(sample) {
46
+  emp_freq <- matrix(0, nrow = 19, ncol = 4)
47
+  for(i in seq(19)) {
48
+    for(j in seq(4)) {
49
+      emp_freq[i, j] <- sum(sample[i, ] == j - 1)
50
+    }
51
+  }
52
+  emp_freq <- emp_freq / apply(emp_freq, 1, sum)
53
+  return(emp_freq)
54
+}
55
+
56
+test_that("Error: quantile function computing are not equivalent.", {
57
+  for(p in c(1, 10, 50, 90, 99) / 100) {
58
+    delta <- .Call("test_find_percentile_diff", scores, p, package = "MotifAnalysis")
59
+    delta.r <- as.double(sort(apply(scores, 1, function(x) -abs(diff(x))))[as.integer(p * nrow(scores)) + 1])
60
+    expect_equal(delta, delta.r)
61
+  }
62
+})
63
+
64
+test_that("Error: the scores for samples are not equivalent.", {
65
+  
66
+  p <- 0.1
67
+  delta <- .Call("test_find_percentile_diff", scores, p, package = "MotifAnalysis")
68
+  theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, snpInfo$transition, delta, package = "MotifAnalysis")
69
+
70
+  ## Use R code to generate a random sample
71
+  for(i in seq(10)) {
72
+    sample <- drawonesample(theta)
73
+    sample_score <- .Call("test_compute_sample_score_diff", test_pwm, test_score, sample[seq(19)] - 1, sample[20] - 1, package = "MotifAnalysis")
74
+    expect_equal(sample[21], exp(sample_score[1]))
75
+
76
+    sample1 <- sample2 <- sample3 <- sample
77
+    sample1[10] <- seq(4)[-sample[10]][1]
78
+    sample2[10] <- seq(4)[-sample[10]][2]
79
+    sample3[10] <- seq(4)[-sample[10]][3]
80
+    sample_score_r <- log(maxjointprob(sample[seq(19)])) -
81
+      log(c(maxjointprob(sample1[seq(19)]),
82
+            maxjointprob(sample2[seq(19)]),
83
+            maxjointprob(sample3[seq(19)])))
84
+    expect_equal(-sample_score_r, sample_score[-1])
85
+  }
86
+  
87
+  ## Use C code to generate a random sample
88
+  for(i in seq(10)) {
89
+    delta <- matrix(0, nrow = 40, ncol = 10)
90
+    for(i in seq(10)) {
91
+      delta[4 * (i - 1) + seq(4), 10] <- snpInfo$prior * test_score[11 - i, ] ^ theta
92
+    }
93
+    for(i in 1:10) {
94
+      delta[4 * (i- 1) + seq(4), seq(9)] <- sum(snpInfo$prior * delta[4 * (i - 1) + seq(4), 10])
95
+    }
96
+    sample <- .Call("test_importance_sample", delta, snpInfo$prior, trans_mat, test_pwm, theta, package = "MotifAnalysis")
97
+    start_pos <- sample[20] + 1
98
+    adj_score <- test_score[11 - start_pos, sample[10] + 1]
99
+    sample_score <- .Call("test_compute_sample_score_diff", test_pwm, test_score, sample[seq(19)], sample[20], package = "MotifAnalysis")
100
+    expect_equal(adj_score, exp(sample_score[1]))
101
+  }
102
+  
103
+})
104
+
105
+test_that("Error: compute the normalizing constant.", {
106
+  
107
+  ## parameters
108
+  p <- 0.99
109
+  delta <- .Call("test_find_percentile_diff", scores, p, package = "MotifAnalysis")
110
+  theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, snpInfo$transition, delta, package = "MotifAnalysis")
111
+  
112
+  ##
113
+  const <- .Call("test_func_delta_diff", test_score, snpInfo$prior, trans_mat, theta, package = "MotifAnalysis")
114
+  const.r <- sum(snpInfo$prior * t(test_score) ^ theta) / 10
115
+  expect_equal(const, const.r)
116
+    
117
+})
118
+
119
+test_that("Error: sample distributions are not expected.", {
120
+  
121
+  ## parameters
122
+  p <- 0.1
123
+  delta <- .Call("test_find_percentile_diff", scores, p, package = "MotifAnalysis")
124
+  theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, snpInfo$transition, delta, package = "MotifAnalysis")
125
+  delta <- matrix(0, nrow = 40, ncol = 10)
126
+  for(i in seq(10)) {
127
+    delta[4 * (i - 1) + seq(4), 10] <- test_score[11 - i, ] ^ theta
128
+  }
129
+  for(i in 1:10) {
130
+    delta[4 * (i- 1) + seq(4), seq(9)] <- sum(snpInfo$prior * delta[4 * (i - 1) + seq(4), 10])
131
+  }
132
+
133
+  target_freq <- matrix(snpInfo$prior, nrow = 4, ncol = 19)
134
+  mat <- snpInfo$prior * t(test_score) ^ theta
135
+  wei <- apply(mat, 2, sum)
136
+  mat <- mat / rep(apply(mat, 2, sum), each = 4)
137
+  mat <- mat * rep(wei, each = 4)
138
+  target_freq[, 10] <- apply(mat, 1, mean)
139
+  target_freq <- t(target_freq)
140
+  target_freq <- target_freq / apply(target_freq, 1, sum)
141
+  
142
+  registerDoMC(4)
143
+  
144
+  results <- foreach(i = seq(20)) %dopar% {
145
+
146
+    ## generate 1000 samples
147
+    sample1 <- sapply(seq(1000), function(x)
148
+                     .Call("test_importance_sample_diff",
149
+                           delta, snpInfo$prior, trans_mat, test_score, theta, package = "MotifAnalysis"))
150
+    emp_freq1 <- get_freq(sample1)
151
+    
152
+    sample2 <- sapply(rep(theta, 1000), drawonesample)
153
+    emp_freq2 <- get_freq(sample2 - 1)
154
+
155
+    print(rbind(emp_freq1[10, ], emp_freq2[10, ], target_freq[10, ]))
156
+    max(abs(emp_freq1 - target_freq)) > max(abs(emp_freq2 - target_freq))
157
+    
158
+  }
159
+  sum(unlist(results))
160
+
161
+  print(pbinom(sum(unlist(results)), size = 20, prob = 0.5))
162
+  
163
+})
164
+
165
+## Visual checks
166
+if(FALSE) {
167
+  
168
+  plot(log(y <- sapply(seq(200) / 100 - 1, function(x)
169
+            .Call("test_func_delta_diff", test_score, snpInfo$prior, snpInfo$transition, x, package = "MotifAnalysis"))))
170
+
171
+## test the theta
172
+
173
+  p_values_9 <- .Call("test_p_value_diff", test_pwm, test_score, snpInfo$prior, snpInfo$transition, scores, 0.1, package = "MotifAnalysis")
174
+  p_values_8 <- .Call("test_p_value_diff", test_pwm, test_score, snpInfo$prior, snpInfo$transition, scores, 0.2, package = "MotifAnalysis")
175
+
176
+  score_diff <- apply(scores, 1, function(x) abs(diff(x)))
177
+  
178
+  par(mfrow = c(1, 2))
179
+  plot(log(p_values_9) ~ score_diff)
180
+  plot(log(p_values_8) ~ score_diff)
181
+  
182
+  p_values_9 <- .Call("test_p_value_diff", test_pwm, test_score, snpInfo$prior, trans_mat, scores, 0.1, package = "MotifAnalysis")
183
+  p_values_8 <- .Call("test_p_value_diff", test_pwm, test_score, snpInfo$prior, trans_mat, scores, 0.2, package = "MotifAnalysis")
184
+  
185
+  pval_test <- function(x) {
186
+    delta <- .Call("test_find_percentile_diff", scores, x, package = "MotifAnalysis")
187
+    theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, trans_mat, delta, package = "MotifAnalysis")
188
+    const <- sum(snpInfo$prior * t(test_score) ^ theta) / 10
189
+    message("Constant value: ", const)
190
+    sample <- sapply(rep(theta, 1000), drawonesample)
191
+    pr <- apply(sample[1:19, ], 2, maxjointprob)
192
+    sample1 <- sample2 <- sample3 <- sample
193
+    sample1[10, ] <- sapply(sample1[10, ], function(x) seq(4)[-x][1])
194
+    sample2[10, ] <- sapply(sample2[10, ], function(x) seq(4)[-x][2])
195
+    sample3[10, ] <- sapply(sample3[10, ], function(x) seq(4)[-x][3])
196
+    pr1 <- apply(sample1[1:19, ], 2, maxjointprob)
197
+    pr2 <- apply(sample2[1:19, ], 2, maxjointprob)
198
+    pr3 <- apply(sample3[1:19, ], 2, maxjointprob)
199
+
200
+    log_diff <- log(pr) - log(c(pr1, pr2, pr3)) 
201
+    
202
+    wei <- rep(const / sample[21, ] ^ theta, 3)
203
+    message("Mean weight: ", mean(wei))
204
+    message("Mean diff score: ", mean(log_diff))
205
+    message("Target score: ", mean(log(sample[21, ])))
206
+    pval <- sapply(score_diff, function(x) sum(rep(wei, each = 3)[log_diff >= x]) / length(log_diff))
207
+    return(pval)
208
+  }
209
+
210
+  pval_test1 <- function(x) {
211
+    delta <- .Call("test_find_percentile_diff", scores, x, package = "MotifAnalysis")
212
+    theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, trans_mat, delta, package = "MotifAnalysis")
213
+    const <- sum(snpInfo$prior * t(test_score) ^ theta) / 10
214
+    message("Constant value: ", const)
215
+    log_diff <- rep(0, 3000)
216
+    wei <- rep(0, 1000)
217
+    for(i in seq(1000)) {
218
+      sample <- drawonesample(theta)
219
+      sample_score <- .Call("test_compute_sample_score_diff", test_pwm, test_score, sample[seq(19)] - 1, sample[20] - 1, package = "MotifAnalysis")
220
+      log_diff[seq(3) + 3 * (i - 1)] <- sample_score[-1]
221
+      wei[i] <- const / exp(sample_score[1] * theta)
222
+    }
223
+    message("Mean weight: ", mean(wei))
224
+    message("Mean diff score: ", mean(log_diff))
225
+
226
+    pval <- sapply(score_diff, function(x) sum(rep(wei, each = 3)[log_diff >= x]) / length(log_diff))
227
+    return(pval)
228
+  }
229
+
230
+
231
+  pval_8 <- pval_test(0.2)
232
+  pval1_8 <- pval_test1(0.2)
233
+
234
+  min(pval_8)
235
+  min(pval1_8)
236
+
237
+  p_values_8 <- 
238
+  
239
+  pval1_9 <- pval_test1(0.1)
240
+  
241
+  plot(log(pval_8), log(p_values_8))
242
+  abline(0,1)
243
+  
244
+  plot(log(pval1_8), log(p_values_8))
245
+  abline(0,1)
246
+  
247
+  plot(log(pval_9), log(p_values_9))
248
+  abline(0,1)
249
+
250
+  plot(log(pval1_9), log(p_values_9))
251
+  abline(0,1)
252
+
253
+}
... ...
@@ -211,23 +211,27 @@ if(FALSE) {
211 211
     delta <- .Call("test_find_percentile_diff", scores, x, package = "MotifAnalysis")
212 212
     theta <- .Call("test_find_theta_diff", test_score, snpInfo$prior, trans_mat, delta, package = "MotifAnalysis")
213 213
     const <- sum(snpInfo$prior * t(test_score) ^ theta) / 10
214
-    log_diff <- rep(0, 30000)
215
-    wei <- rep(0, 10000)
216
-    for(i in seq(10000)) {
214
+    message("Constant value: ", const)
215
+    log_diff <- rep(0, 3000)
216
+    wei <- rep(0, 1000)
217
+    for(i in seq(1000)) {
217 218
       sample <- drawonesample(theta)
218 219
       sample_score <- .Call("test_compute_sample_score_diff", test_pwm, test_score, sample[seq(19)] - 1, sample[20] - 1, package = "MotifAnalysis")
219 220
       log_diff[seq(3) + 3 * (i - 1)] <- sample_score[-1]
220 221
       wei[i] <- const / exp(sample_score[1] * theta)
221 222
     }
223
+    message("Mean weight: ", mean(wei))
224
+    message("Mean diff score: ", mean(log_diff))
225
+    message("Target score: ", mean(log(sample[21, ])))
226
+
222 227
     pval <- sapply(score_diff, function(x) sum(rep(wei, each = 3)[log_diff >= x]) / length(log_diff))
223 228
     return(pval)
224 229
   }
225 230
 
226 231
 
227 232
   pval_8 <- pval_test(0.2)
228
-  pval_9 <- pval_test(0.1)
229
-
230 233
   pval1_8 <- pval_test1(0.2)
234
+  
231 235
   pval1_9 <- pval_test1(0.1)
232 236
   
233 237
   plot(log(pval_8), log(p_values_8))