#' ANOVA model class #' #' ANOVA model class. ANOVA for all features in a dataset #' #' @import struct #' @import stats #' @import car #' @export ANOVA ANOVA<-setClass( "ANOVA", contains=c('method','stato'), slots=c( # INPUTS params.alpha='entity.stato', params.mtc='entity.stato', params.formula='entity', params.type='enum', # OUTPUTS outputs.f_statistic='entity.stato', outputs.p_value='entity.stato', outputs.significant='entity' ), prototype = list(name='Analysis of Variance', description='ANOVA applied to each column of a dataset.', type="univariate", predicted='p_value', stato.id="OBI:0200201", params.alpha=entity.stato(name='Confidence level', stato.id='STATO:0000053', value=0.05, type='numeric', description='the p-value cutoff for determining significance.' ), params.mtc=entity.stato(name='Multiple Test Correction method', stato.id='OBI:0200089', value='fdr', type='numeric', description='The method used to adjust for multiple comparisons.' ), params.type=enum(name='ANOVA type', description='I, II or [III]. The type of sums of squares to use. For balanced designs all types gives the same result.', value='III', type='character', list=c('I','II','III') ), outputs.f_statistic=entity.stato(name='F-statistic', stato.id='STATO:0000176', type='numeric', description='the value of the calculated statistic which is converted to a p-value when compared to an F-distribution.' ), outputs.p_value=entity.stato(name='p value', stato.id='STATO:0000175', type='numeric', description='the probability of observing the calculated t-statistic.' ), outputs.significant=entity(name='Significant features', #stato.id='STATO:0000069', type='logical', description='TRUE if the calculated p-value is less than the supplied threhold (alpha)' ) ) ) #' @export setMethod(f="method.apply", signature=c("ANOVA",'dataset'), definition=function(M,D) { X=dataset.data(D) var_names=all.vars(M$formula) # attempt to detect within factors within=which(var_names %in% all.names(M$formula)[which('Error'== all.names(M$formula))+2]) if (length(within)>0) { var_names_ex=var_names[-within] } else { var_names_ex=var_names } y=dataset.sample_meta(D)[var_names[2:length(var_names)]] # set the contrasts O=options('contrasts') # keep the old ones options(contrasts = c("contr.sum","contr.poly")) output=apply(X,2,function(x) { temp=y temp[[var_names[1]]]=x # check number levels temp2=na.omit(temp) s=sapply(var_names_ex[2:length(var_names)],function(x) summary(temp2[[x]])) n=nrow(temp2) # check for alias columns dona=FALSE if (all(unlist(s)>2)) { # check we have enough levels temp3=temp[,var_names_ex] # ignore within factors al=alias(params$formula,temp3) # check we have independent columns if ('Complete' %in% names(al)) { dona=TRUE } } else { dona=TRUE } # check for enough samples temp3=temp temp3[[var_names[1]]]=rnorm(nrow(y)) LM=lm(formula=M$formula,data=temp3) p=nrow(summary(LM)$coefficients) if (n<(p+1)) { dona=TRUE } if (dona) { # missing values have probably prevented one or more combinations of levels being present # use some fake data to generate the output table then replace all the values with NA temp[[var_names[1]]]=rnorm(nrow(y)) LM=lm(formula=M$formula,data=temp) A=Anova(LM,type=M$type) A[!is.na(A)]=NA return(A) } LM=lm(formula=M$formula,data=temp) A=Anova(LM,type=M$type) return(A) }) f_statistic=sapply(output,function(x){ x$`F value` }) f_statistic=as.data.frame(t(f_statistic)) colnames(f_statistic)=rownames(output[[1]]) f_statistic=f_statistic[,2:(ncol(f_statistic)-1),drop=FALSE] p_value=sapply(output,function(x){ x$`Pr(>F)` }) p_value=as.data.frame(t(p_value)) colnames(p_value)=rownames(output[[1]]) p_value=p_value[,2:(ncol(p_value)-1),drop=FALSE] # fdr correct the p.values for (k in 1:ncol(p_value)) { p_value[, k]=p.adjust(p_value[ ,k],M$mtc) } # populate the object M$f_statistic=f_statistic M$p_value=p_value M$significant=p_value<M$alpha # reset the contrasts options(O) return(M) } )