R/anova_class.R
883f7f27
 #' ANOVA model class
 #'
 #' ANOVA model class. ANOVA for all features in a dataset
 #'
 #' @import struct
 #' @import stats
 #' @import car
 #' @export ANOVA
 ANOVA<-setClass(
   "ANOVA",
   contains=c('method','stato'),
   slots=c(
     # INPUTS
     params.alpha='entity.stato',
     params.mtc='entity.stato',
     params.formula='entity',
     params.type='enum',
     # OUTPUTS
     outputs.f_statistic='entity.stato',
     outputs.p_value='entity.stato',
     outputs.significant='entity'
   ),
   prototype = list(name='Analysis of Variance',
     description='ANOVA applied to each column of a dataset.',
     type="univariate",
     predicted='p_value',
     stato.id="OBI:0200201",
 
     params.alpha=entity.stato(name='Confidence level',
       stato.id='STATO:0000053',
       value=0.05,
       type='numeric',
       description='the p-value cutoff for determining significance.'
     ),
     params.mtc=entity.stato(name='Multiple Test Correction method',
       stato.id='OBI:0200089',
       value='fdr',
       type='numeric',
       description='The method used to adjust for multiple comparisons.'
     ),
     params.type=enum(name='ANOVA type',
       description='I, II or [III]. The type of sums of squares to use. For balanced designs all types gives the same result.',
       value='III',
       type='character',
       list=c('I','II','III')
     ),
     outputs.f_statistic=entity.stato(name='F-statistic',
       stato.id='STATO:0000176',
       type='numeric',
       description='the value of the calculated statistic which is converted to a p-value when compared to an F-distribution.'
     ),
     outputs.p_value=entity.stato(name='p value',
       stato.id='STATO:0000175',
       type='numeric',
       description='the probability of observing the calculated t-statistic.'
     ),
     outputs.significant=entity(name='Significant features',
       #stato.id='STATO:0000069',
       type='logical',
       description='TRUE if the calculated p-value is less than the supplied threhold (alpha)'
     )
   )
 )
 
 #' @export
 setMethod(f="method.apply",
   signature=c("ANOVA",'dataset'),
   definition=function(M,D)
   {
     X=dataset.data(D)
     var_names=all.vars(M$formula)
 
     # attempt to detect within factors
     within=which(var_names %in% all.names(M$formula)[which('Error'== all.names(M$formula))+2])
 
     if (length(within)>0) {
       var_names_ex=var_names[-within]
     } else {
       var_names_ex=var_names
     }
 
     y=dataset.sample_meta(D)[var_names[2:length(var_names)]]
 
     # set the contrasts
     O=options('contrasts') # keep the old ones
     options(contrasts = c("contr.sum","contr.poly"))
 
     output=apply(X,2,function(x) {
       temp=y
       temp[[var_names[1]]]=x
 
       # check number levels
       temp2=na.omit(temp)
       s=sapply(var_names_ex[2:length(var_names)],function(x) summary(temp2[[x]]))
       n=nrow(temp2)
 
       # check for alias columns
       dona=FALSE
       if (all(unlist(s)>2)) { # check we have enough levels
         temp3=temp[,var_names_ex] # ignore within factors
         al=alias(params$formula,temp3) # check we have independent columns
         if ('Complete' %in% names(al)) {
           dona=TRUE
         }
       } else {
         dona=TRUE
       }
 
       # check for enough samples
       temp3=temp
       temp3[[var_names[1]]]=rnorm(nrow(y))
       LM=lm(formula=M$formula,data=temp3)
       p=nrow(summary(LM)$coefficients)
       if (n<(p+1)) {
         dona=TRUE
       }
 
       if (dona) {
         # missing values have probably prevented one or more combinations of levels being present
         # use some fake data to generate the output table then replace all the values with NA
         temp[[var_names[1]]]=rnorm(nrow(y))
         LM=lm(formula=M$formula,data=temp)
         A=Anova(LM,type=M$type)
         A[!is.na(A)]=NA
         return(A)
       }
 
       LM=lm(formula=M$formula,data=temp)
       A=Anova(LM,type=M$type)
       return(A)
     })
 
     f_statistic=sapply(output,function(x){
       x$`F value`
     })
     f_statistic=as.data.frame(t(f_statistic))
     colnames(f_statistic)=rownames(output[[1]])
     f_statistic=f_statistic[,2:(ncol(f_statistic)-1),drop=FALSE]
 
 
     p_value=sapply(output,function(x){
       x$`Pr(>F)`
     })
     p_value=as.data.frame(t(p_value))
     colnames(p_value)=rownames(output[[1]])
     p_value=p_value[,2:(ncol(p_value)-1),drop=FALSE]
 
     # fdr correct the p.values
     for (k in 1:ncol(p_value)) {
       p_value[, k]=p.adjust(p_value[ ,k],M$mtc)
     }
 
     # populate the object
     M$f_statistic=f_statistic
     M$p_value=p_value
     M$significant=p_value<M$alpha
 
     # reset the contrasts
     options(O)
 
     return(M)
   }
 )