883f7f27 |
#' ANOVA model class
#'
#' ANOVA model class. ANOVA for all features in a dataset
#'
#' @import struct
#' @import stats
#' @import car
#' @export ANOVA
ANOVA<-setClass(
"ANOVA",
contains=c('method','stato'),
slots=c(
# INPUTS
params.alpha='entity.stato',
params.mtc='entity.stato',
params.formula='entity',
params.type='enum',
# OUTPUTS
outputs.f_statistic='entity.stato',
outputs.p_value='entity.stato',
outputs.significant='entity'
),
prototype = list(name='Analysis of Variance',
description='ANOVA applied to each column of a dataset.',
type="univariate",
predicted='p_value',
stato.id="OBI:0200201",
params.alpha=entity.stato(name='Confidence level',
stato.id='STATO:0000053',
value=0.05,
type='numeric',
description='the p-value cutoff for determining significance.'
),
params.mtc=entity.stato(name='Multiple Test Correction method',
stato.id='OBI:0200089',
value='fdr',
type='numeric',
description='The method used to adjust for multiple comparisons.'
),
params.type=enum(name='ANOVA type',
description='I, II or [III]. The type of sums of squares to use. For balanced designs all types gives the same result.',
value='III',
type='character',
list=c('I','II','III')
),
outputs.f_statistic=entity.stato(name='F-statistic',
stato.id='STATO:0000176',
type='numeric',
description='the value of the calculated statistic which is converted to a p-value when compared to an F-distribution.'
),
outputs.p_value=entity.stato(name='p value',
stato.id='STATO:0000175',
type='numeric',
description='the probability of observing the calculated t-statistic.'
),
outputs.significant=entity(name='Significant features',
#stato.id='STATO:0000069',
type='logical',
description='TRUE if the calculated p-value is less than the supplied threhold (alpha)'
)
)
)
#' @export
setMethod(f="method.apply",
signature=c("ANOVA",'dataset'),
definition=function(M,D)
{
X=dataset.data(D)
var_names=all.vars(M$formula)
# attempt to detect within factors
within=which(var_names %in% all.names(M$formula)[which('Error'== all.names(M$formula))+2])
if (length(within)>0) {
var_names_ex=var_names[-within]
} else {
var_names_ex=var_names
}
y=dataset.sample_meta(D)[var_names[2:length(var_names)]]
# set the contrasts
O=options('contrasts') # keep the old ones
options(contrasts = c("contr.sum","contr.poly"))
output=apply(X,2,function(x) {
temp=y
temp[[var_names[1]]]=x
# check number levels
temp2=na.omit(temp)
s=sapply(var_names_ex[2:length(var_names)],function(x) summary(temp2[[x]]))
n=nrow(temp2)
# check for alias columns
dona=FALSE
if (all(unlist(s)>2)) { # check we have enough levels
temp3=temp[,var_names_ex] # ignore within factors
al=alias(params$formula,temp3) # check we have independent columns
if ('Complete' %in% names(al)) {
dona=TRUE
}
} else {
dona=TRUE
}
# check for enough samples
temp3=temp
temp3[[var_names[1]]]=rnorm(nrow(y))
LM=lm(formula=M$formula,data=temp3)
p=nrow(summary(LM)$coefficients)
if (n<(p+1)) {
dona=TRUE
}
if (dona) {
# missing values have probably prevented one or more combinations of levels being present
# use some fake data to generate the output table then replace all the values with NA
temp[[var_names[1]]]=rnorm(nrow(y))
LM=lm(formula=M$formula,data=temp)
A=Anova(LM,type=M$type)
A[!is.na(A)]=NA
return(A)
}
LM=lm(formula=M$formula,data=temp)
A=Anova(LM,type=M$type)
return(A)
})
f_statistic=sapply(output,function(x){
x$`F value`
})
f_statistic=as.data.frame(t(f_statistic))
colnames(f_statistic)=rownames(output[[1]])
f_statistic=f_statistic[,2:(ncol(f_statistic)-1),drop=FALSE]
p_value=sapply(output,function(x){
x$`Pr(>F)`
})
p_value=as.data.frame(t(p_value))
colnames(p_value)=rownames(output[[1]])
p_value=p_value[,2:(ncol(p_value)-1),drop=FALSE]
# fdr correct the p.values
for (k in 1:ncol(p_value)) {
p_value[, k]=p.adjust(p_value[ ,k],M$mtc)
}
# populate the object
M$f_statistic=f_statistic
M$p_value=p_value
M$significant=p_value<M$alpha
# reset the contrasts
options(O)
return(M)
}
)
|