Browse code

Util functions for arrays

Jean-Philippe Fortin authored on 10/02/2025 17:37:53
Showing 1 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,489 @@
1
+#' @title Add well row and column info to rowData
2
+#' 
3
+#' @description Add well row and column info to rowData
4
+#' 
5
+#' @param se A \linkS4class{SummarizedExperiment} object.
6
+#' 
7
+#' @return A \linkS4class{SummarizedExperiment} object with updated
8
+#'     annotation.
9
+#' 
10
+#' @author Jean-Philippe Fortin
11
+#' 
12
+#' @examples
13
+#' data(seExample)
14
+#' seExample <- addPlateRowAndColumn(seExample)
15
+#' 
16
+#' @importFrom stringr str_extract
17
+#' @importFrom SummarizedExperiment rowData rowData<-
18
+#' @export
19
+addPlateRowAndColumn <- function(se){
20
+    ann <- rowData(se)
21
+    if (!"Well" %in% colnames(ann)){
22
+        stop("Well must be in colData(se)")
23
+    }
24
+    ann$row    <- str_extract(rowData(se)[["Well"]], "[A-Z]+")
25
+    ann$column <- str_extract(rowData(se)[["Well"]], "[0-9]+")
26
+    ann$column <- as.integer(ann$column)
27
+    rowData(se) <- ann
28
+    return(se)
29
+}
30
+
31
+
32
+
33
+
34
+
35
+
36
+#' @title Add a column in the feature annotation for gene class
37
+#' 
38
+#' @description Add a column in the feature annotation for gene class.
39
+#' 
40
+#' @param se A \linkS4class{SummarizedExperiment} object.
41
+#' @param gene.field String specifying the name of the column
42
+#'     that contains the gene names.
43
+#' @param class.field String specifying the name of the column
44
+#'     that will store gene class. Default is "class".
45
+#' @param default.class What should be the default class for
46
+#'     genes not specified in \code{classes}?
47
+#'     Default is "target". 
48
+#' @param classes Named list specifying the gene classes.
49
+#'     Names of the list correspond to the names of the classes,
50
+#'     each each list element contains the gene names corresponding
51
+#'     the the given class.
52
+#' 
53
+#' @return A \linkS4class{SummarizedExperiment} object with updated
54
+#'     annotation.
55
+#' 
56
+#' @author Jean-Philippe Fortin
57
+#' 
58
+#' @importFrom stringr str_extract
59
+#' @importFrom SummarizedExperiment rowData rowData<-
60
+#' @export
61
+addGeneClassColumn <- function(se,
62
+                               gene.field="Gene",
63
+                               class.field="class",
64
+                               default.class="target",
65
+                               classes=NULL
66
+){
67
+    ann <- rowData(se)
68
+    ann[[class.field]] <- default.class
69
+    if (!gene.field %in% colnames(ann)){
70
+        stop("gene.field must be in colnames(rowData(se))")
71
+    }
72
+    if (!is.null(classes)){
73
+        n_classes <- length(classes)
74
+        for (class in names(classes)){
75
+            wh <- which(ann[[gene.field]] %in% classes[[class]])
76
+            ann[[class.field]][wh] <- class
77
+        }
78
+    }
79
+    rowData(se) <- ann
80
+    return(se)
81
+}
82
+
83
+
84
+
85
+
86
+#' @title log2 transformation of assay data
87
+#' 
88
+#' @description log2 transformation of assay data.
89
+#' 
90
+#' @param se A \linkS4class{SummarizedExperiment} object.
91
+#' @param assay Integer vector specifying the indices of the assay
92
+#'     to log2 transform. If NULL (default), all assays
93
+#'     are log2 trasnformed.
94
+#' @param offset Integer specifying offset to use 
95
+#'     in the log2 transformation.
96
+#' 
97
+#' @return A \linkS4class{SummarizedExperiment} object with 
98
+#'     one or many assays log2 transformed.
99
+#' 
100
+#' @examples
101
+#' data(seExample)
102
+#' seExample <- logTransform(seExample, offset=1)
103
+#' 
104
+#' @author Jean-Philippe Fortin
105
+#' 
106
+#' @importFrom stringr str_extract
107
+#' @importFrom SummarizedExperiment rowData rowData<-
108
+#' @importFrom SummarizedExperiment assays assays<-
109
+#' @export
110
+logTransform <- function(se,
111
+                         assay=NULL,
112
+                         offset=1
113
+){
114
+    n_assays <- length(assays(se))
115
+    if (is.null(assay)){
116
+        assaysToTransform <- seq_len(n_assays)
117
+    } else {
118
+        assaysToTransform <- assay
119
+    }
120
+    
121
+    for (assay in assaysToTransform){
122
+        assays(se)[[assay]] <- log2(assays(se)[[assay]]+offset)
123
+    }
124
+    return(se)
125
+}
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+#' @title Z score transformation using negative controls.
136
+#' 
137
+#' @description Z score transformation using negative controls.
138
+#' 
139
+#' @param se A \linkS4class{SummarizedExperiment} object.
140
+#' @param neg.controls Character vector specifying genes that should
141
+#'     be used for z-score transformation. The genes must be present
142
+#'     in the \code{gene.field} column of \code{rowData(se)}.
143
+#'     If NULL (default), all genes will be considered.
144
+#' @param gene.field String specifying the name of the column
145
+#'     that contains the gene names specified by \code{neg.controls}.
146
+#' @param fun.centering String specifying the method 
147
+#'     to use for centering. Either "median" (default) or "mean".
148
+#' @param fun.scaling String specifying the method
149
+#'     to use for scaling. Either "mad" (default) or "sd".
150
+#' @param scaling.ignore.controls Should the scaling factors be
151
+#'     calculated using all genes? TRUE by default, which
152
+#'     will ignore negative controls.
153
+#' @param class.field String specifying the name of the column
154
+#'     that stored the gene class in \code{rowData(se)}.
155
+#' @param class.levels Gene classes that should be considered
156
+#'     for z-score transformation. If NULL (default), all classes
157
+#'     are considered. If specified, intersection of the specified 
158
+#'     gene classes and the genes specified in \code{neg.controls}
159
+#'     will be considered for z scoring.
160
+#' 
161
+#' @return A \linkS4class{SummarizedExperiment} object with 
162
+#'     assays z-score transformed. 
163
+#' 
164
+#' @author Jean-Philippe Fortin
165
+#' 
166
+#' @importFrom matrixStats colMedians colMads colSds
167
+#' @export
168
+transformToZScores <- function(se,
169
+                               neg.controls=NULL,
170
+                               gene.field="Gene",
171
+                               fun.centering=c("median", "mean"),
172
+                               fun.scaling=c("mad", "sd"),
173
+                               scaling.ignore.controls=TRUE,
174
+                               class.field="class",
175
+                               class.levels="target"
176
+){
177
+    fun.centering <- match.arg(fun.centering)
178
+    fun.scaling   <- match.arg(fun.scaling)
179
+    n_assays <- length(assays(se))
180
+    ann <- rowData(se)
181
+    n_genes <- nrow(se)
182
+    for (assay in seq_len(n_assays)){
183
+        Y <- assays(se)[[assay]]
184
+        wh_centering <- seq_len(n_genes)
185
+        wh_scaling   <- seq_len(n_genes)
186
+        wh_class     <- seq_len(n_genes)
187
+        if (!is.null(neg.controls)){
188
+            wh_centering <- which(ann[[gene.field]] %in% neg.controls)
189
+            if (!scaling.ignore.controls){
190
+                wh_scaling <- which(ann[[gene.field]] %in% neg.controls)
191
+            } 
192
+        }
193
+        if (!is.null(class.levels)){
194
+            wh_class <- which(ann[[class.field]] %in% class.levels) 
195
+        }
196
+        wh_centering <- intersect(wh_centering, wh_class)
197
+        wh_scaling   <- intersect(wh_scaling, wh_class)
198
+            
199
+    
200
+        if (length(wh_centering)==0 | length(wh_scaling)==0 ){
201
+            stop("Combination of neg.controls and class.levels returns
202
+                  0 gene for transformation.")
203
+        }
204
+
205
+        #Centering:
206
+        if (fun.centering=="median"){
207
+            factors <- colMedians(Y[wh_centering,,drop=FALSE], na.rm=TRUE)
208
+        } else {
209
+            factors <- colMeans(Y[wh_centering,,drop=FALSE], na.rm=TRUE)
210
+        }
211
+        Y <- sweep(Y, 2 ,factors, "-")
212
+        
213
+        #Scaling:
214
+        if (fun.scaling=="mad"){
215
+            factors <- colMads(Y[wh_scaling,,drop=FALSE], na.rm=TRUE)
216
+        } else {
217
+            factors <- colSds(Y[wh_scaling,,drop=FALSE], na.rm=TRUE)
218
+        }
219
+        Y <- sweep(Y, 2 ,factors, "/")
220
+        assays(se)[[assay]] <- Y
221
+    }
222
+    return(se)
223
+}
224
+
225
+
226
+
227
+#' @title Calculate limma statistics 
228
+#' 
229
+#' @description Calculate limma statistics.
230
+#' 
231
+#' @param se A \linkS4class{SummarizedExperiment} object.
232
+#' @param design Design matrix to pass to limma.
233
+#' @param contrast.matrix Contrast matrix to pass to limma.
234
+#' 
235
+#' @return For each assay in \code{se}, it returns a named
236
+#'     list with the following elements:
237
+#'     - lfc: log-fold changes
238
+#'     - pval: p-lvaues
239
+#'     - fdr: FDR-adjusted p-values. 
240
+#'     
241
+#' 
242
+#' @examples 
243
+#' data(seExample)
244
+#' library(SummarizedExperiment)
245
+#' 
246
+#' classes <- list(neg=c("OR5M9","OR6N1"),
247
+#'     pos=c("KRAS","NRAS"),
248
+#'     rnaimax=c("RNAiMAX"),
249
+#'     empty="empty")
250
+#' seExample <- addGeneClassColumn(seExample, classes=classes)
251
+#' 
252
+#' # Transforming the data:
253
+#' seExample <- logTransform(seExample, offset=1)
254
+#' seExample <- normalizeBetweenPlates(seExample)
255
+#' seExample <- transformToZScores(seExample)
256
+#' 
257
+#' # Creating design matrix:
258
+#' pheno  <- as.factor(colData(seExample)$Condition)
259
+#' pheno  <- relevel(pheno, ref="WT")
260
+#' design <- model.matrix(~pheno-1)
261
+#' colnames(design) <- gsub("pheno", "", colnames(design))
262
+#' contrasts <- c(Mutant = "Mutant",
263
+#'     Wildtype="WT",
264
+#'     Mutant_vs_Wildtype="Mutant - WT")
265
+#' contrast.matrix <- cbind("Wildtype"=c(1,0),
266
+#'     "Mutant"=c(0,1),
267
+#'     "Mutant_vs_Wildtype"=c(-1,1))
268
+#' 
269
+#' # Calculate statistics:
270
+#' stats <- calculateStatistics(seExample,
271
+#'     design=design,
272
+#'     contrast.matrix=contrast.matrix)
273
+#' 
274
+#' @author Jean-Philippe Fortin
275
+#' 
276
+#' @importFrom limma lmFit contrasts.fit eBayes
277
+#' @importFrom stats p.adjust
278
+#' @export
279
+calculateStatistics <- function(se,
280
+                                design,
281
+                                contrast.matrix=NULL
282
+){
283
+    n_assays <- length(assays(se))
284
+    results <- list()
285
+    for (assay in seq_len(n_assays)){
286
+        Y <- assays(se)[[assay]]
287
+        fit <- lmFit(Y, design)
288
+        if (!is.null(contrast.matrix)){
289
+            fit <- contrasts.fit(fit, contrast.matrix)
290
+        }
291
+        fit <- eBayes(fit)
292
+        stats <- list()
293
+        stats[["lfc"]]  <- fit$coefficients
294
+        stats[["pval"]] <- fit$p.value
295
+        stats[["fdr"]]  <- apply(stats[["pval"]], 2, p.adjust, "fdr")
296
+        results[[assay]] <- stats
297
+    }
298
+    names(results) <- names(assays(se))
299
+    return(results)
300
+}
301
+
302
+
303
+
304
+#' @title Between-plate normalization
305
+#' 
306
+#' @description Between-plate normalization.
307
+#' 
308
+#' @param se A \linkS4class{SummarizedExperiment} object.
309
+#' @param neg.controls Character vector specifying genes that should
310
+#'     be used for estimating normalizatin factors.
311
+#'     The genes must be present in the \code{gene.field}
312
+#'     column of \code{rowData(se)}.
313
+#'     If NULL (default), all genes will be considered.
314
+#' @param fun String specifying the method 
315
+#'     to use for normalization. Either "median" (default) or "mean".
316
+#' @param plate.field String specifying the name of the column
317
+#'     that contains the plate ID.
318
+#' @param gene.field String specifying the name of the column
319
+#'     that contains the gene names specified by \code{neg.controls}.
320
+#' @param class.field String specifying the name of the column
321
+#'     that stored the gene class in \code{rowData(se)}.
322
+#' @param class.levels Gene classes that should be considered
323
+#'     for estimating normalization factors. If NULL (default), all classes
324
+#'     are considered. If specified, intersection of the specified 
325
+#'     gene classes and the genes specified in \code{neg.controls}
326
+#'     will be considered for z scoring.
327
+#' 
328
+#' @return A \linkS4class{SummarizedExperiment} object with 
329
+#'     value normalized between plates. 
330
+#' 
331
+#' @examples 
332
+#' data(seExample)
333
+#' seExample <- logTransform(seExample, offset=1)
334
+#' 
335
+#' classes <- list(neg=c("OR5M9","OR6N1"),
336
+#'     pos=c("KRAS","NRAS"),
337
+#'     rnaimax=c("RNAiMAX"),
338
+#'     empty="empty")
339
+#' seExample <- addGeneClassColumn(seExample, classes=classes)
340
+#' 
341
+#' seExample <- normalizeBetweenPlates(seExample)
342
+#' 
343
+#' @author Jean-Philippe Fortin
344
+#' 
345
+#' @importFrom matrixStats colMedians
346
+#' @importFrom SummarizedExperiment rowData assays assays<-
347
+#' @export
348
+normalizeBetweenPlates <- function(se,
349
+                                   neg.controls=NULL,
350
+                                   fun=c("median", "mean"),
351
+                                   plate.field="Plate",
352
+                                   gene.field="Gene",
353
+                                   class.field="class",
354
+                                   class.levels=c("target")
355
+){
356
+    fun <- match.arg(fun)
357
+    n_assays  <- length(assays(se))
358
+    assaysToNormalize <- seq_len(n_assays)
359
+    ann    <- rowData(se)
360
+    plate  <- ann[[plate.field]]
361
+    plates <- unique(plate)
362
+    n_plates <- length(plates)
363
+    n_genes <- nrow(se)
364
+    if (!gene.field %in% colnames(ann)){
365
+        stop("gene.field not found in colnames(rowData(se))")
366
+    }
367
+    if (!class.field %in% colnames(ann)){
368
+        stop("class.field not found in colnames(rowData(se))")
369
+    }
370
+    for (assay in assaysToNormalize){
371
+        Y <- assays(se)[[assay]]
372
+        for (k in seq_len(n_plates)){
373
+            if (is.null(neg.controls)){
374
+                wh_control <- seq_len(n_genes)
375
+            } else {
376
+                wh_control <- which(ann[[gene.field]] %in% neg.controls)
377
+            }
378
+            wh_plate <- which(plate==plates[[k]])
379
+            wh_norm  <- intersect(wh_control, wh_plate)
380
+
381
+            # Only normalizing using target genes:
382
+            if (!is.null(class.levels)){
383
+                wh_class <- which(ann[[class.field]] %in% class.levels) 
384
+            } else {
385
+                wh_class <- seq_len(n_genes)
386
+            }
387
+            wh_norm <- intersect(wh_norm, wh_class)
388
+            if (fun=="median"){
389
+                factors <- colMedians(Y[wh_norm,,drop=FALSE], na.rm=TRUE)
390
+            } else {
391
+                factors <- colMeans(Y[wh_norm,,drop=FALSE], na.rm=TRUE)
392
+            }
393
+            Y[wh_plate,] <- sweep(Y[wh_plate,,drop=FALSE], 2 ,factors, "-")
394
+        }
395
+        assays(se)[[assay]] <- Y
396
+    }
397
+    return(se)
398
+}
399
+
400
+
401
+
402
+#' @title Within-plate normalization
403
+#' 
404
+#' @description Within-plate normalization
405
+#' 
406
+#' @param se A \linkS4class{SummarizedExperiment} object.
407
+#' @param fun String specifying the method 
408
+#'     to use for normalization. Either "median" (default) or "mean".
409
+#' @param plate.field String specifying the name of the column
410
+#'     that contains the plate ID.
411
+#' @param what String specifying if plate rows or columns
412
+#'     should be normalized. "row" by default.
413
+#' @param class.field String specifying the name of the column
414
+#'     that stored the gene class in \code{rowData(se)}.
415
+#' @param class.levels Gene classes that should be considered
416
+#'     for estimating normalization factors. If NULL (default), all classes
417
+#'     are considered. If specified, intersection of the specified 
418
+#'     gene classes and the genes specified in \code{neg.controls}
419
+#'     will be considered for z scoring.
420
+#' 
421
+#' @return A \linkS4class{SummarizedExperiment} object with 
422
+#'     value normalized within plates.
423
+#' 
424
+#' #' @examples 
425
+#' data(seExample)
426
+#' seExample <- logTransform(seExample, offset=1)
427
+#' seExample <- normalizeWithinPlate(seExample)
428
+#' 
429
+#' @author Jean-Philippe Fortin
430
+#' @importFrom SummarizedExperiment rowData assays assays<-
431
+#' @importFrom matrixStats colMedians
432
+#' @export
433
+normalizeWithinPlate <- function(se,
434
+                                 fun=c("median", "mean"),
435
+                                 what=c("row", "column"),
436
+                                 plate.field="Plate",
437
+                                 class.field="class",
438
+                                 class.levels=c("target")
439
+){
440
+    fun  <- match.arg(fun)
441
+    what <- match.arg(what)
442
+    n_assays  <- length(assays(se))
443
+    ann <- rowData(se)
444
+    if (!what %in% colnames(ann)){
445
+        se  <- addPlateRowAndColumn(se)
446
+        ann <- rowData(se)
447
+    }
448
+    plate  <- ann[[plate.field]]
449
+    slice  <- paste0(plate, "_", ann[[what]])
450
+    slices <- unique(slice)
451
+    n_slices <- length(slices)
452
+    n_genes <- nrow(se)
453
+    for (assay in seq_len(n_assays)){
454
+        Y <- assays(se)[[assay]]
455
+        for (k in seq_len(n_slices)){
456
+            wh_control <- seq_len(n_genes)
457
+            wh_slice   <- which(slice==slices[[k]])
458
+            wh_norm <- intersect(wh_control, wh_slice)
459
+            # Only normalizing using target genes:
460
+            if (!is.null(class.levels)){
461
+                wh_class <- which(ann[[class.field]] %in% class.levels) 
462
+            } else {
463
+                wh_class <- seq_len(n_genes)
464
+            }
465
+            wh_norm <- intersect(wh_norm, wh_class)
466
+            if (fun=="median"){
467
+                factors <- colMedians(Y[wh_norm,,drop=FALSE], na.rm=TRUE)
468
+            } else {
469
+                factors <- colMeans(Y[wh_norm,,drop=FALSE], na.rm=TRUE)
470
+            }
471
+            Y[wh_slice,] <- sweep(Y[wh_slice,,drop=FALSE], 2 ,factors, "-")
472
+        }
473
+        assays(se)[[assay]] <- Y
474
+    }
475
+    return(se)
476
+}
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
485
+
486
+
487
+
488
+
489
+