R/PCA_class.R
883f7f27
 #' PCA model class
 #'
 #' Principal Component Analysis (PCA) model class. This object can be used to train/apply PCA mdoels to dataset objects.
 #'
 #' @import struct
 #' @export PCA
 
 PCA<-setClass(
   "PCA",
   contains=c('model','stato'),
   slots=c(
           # INPUTS
           params.number_components='entity.stato',
 
           # OUTPUTS
           outputs.scores='entity',
           outputs.loadings='data.frame',
           outputs.eigenvalues='data.frame',
           outputs.ssx='numeric',
           outputs.correlation='data.frame',
           outputs.that='dataset'
   ),
   prototype = list(name='Principal Component Analysis (PCA)',
                    description='PCA is a multivariate data reduction technique. It summarises the data in a smaller number of Principal Components that describe the maximum variation present in the dataset.',
                    type="preprocessing",
                    params=c('number_components'),
                    outputs=c('scores',
                              'loadings',
                              'eigenvalues',
                              'ssx',
                              'correlation',
                              'that'
                              ),
                    predicted='that',
                    stato.id="OBI:0200051",
 
                    params.number_components=entity.stato(name='Number of PCs',
                                                          stato.id='STATO:0000555',
                                                          value=2,
                                                          type='numeric'
                                                          ),
                    outputs.scores=entity('name'='PCA scores matrix',
                                          'description'='A matrix of PCA scores where each column corresponds to a Principal Component')
   )
 )
 
 #' @export
 setMethod(f="model.train",
           signature=c("PCA",'dataset'),
           definition=function(M,D)
           {
             A=param.value(M,'number_components')
             X=as.matrix(dataset.data(D))
             model=svd(X,A,A)
             if (A==1)
             {
               scores=model$u * model$d[A]
             } else {
               scores=model$u %*% diag(model$d[1:A])
             }
 
             scores=as.data.frame(scores)
             rownames(scores)=rownames(X)
             varnames=rep('A',1,A)
             for (i in 1:A)
             {
               varnames[i]=paste0('PC',i)
             }
             colnames(scores)=varnames
             output.value(M,'scores')=scores
 
             P=as.data.frame(model$v)
             rownames(P)=colnames(X)
             colnames(P)=varnames
             output.value(M,'loadings')=P
 
             E=data.frame('Eigenvalues'=sqrt(model$d[1:A]))
             rownames(E)=varnames
             output.value(M,'eigenvalues')=E
 
             output.value(M,'ssx')=sum(X*X) # sum of squares of x
             output.value(M,'correlation')=as.data.frame(cor(X,scores))
 
             return(M)
           }
 )
 
 #' @export
 setMethod(f="model.predict",
           signature=c("PCA",'dataset'),
           definition=function(M,D)
           {
             A=param.value(M,'number_components')
             X=as.matrix(dataset.data(D))
             P=output.value(M,'loadings')
             that=X%*%as.matrix(P)
 
             that=as.data.frame(that)
             rownames(that)=rownames(X)
             varnames=rep('A',1,A)
             for (i in 1:A)
             {
               varnames[i]=paste0('PC',i)
             }
             colnames(that)=varnames
 
             # convert to dataset for preprocessing output
             S=D
             dataset.data(S)=that
             output.value(M,'that')=S
 
             return(M)
           }
 )