R/PCA_plotfcns.R
883f7f27
 #' pca_correlation_plot class
 #'
 #' plots the correlation between features and selected components.
 #'
 #' @import struct
 #' @export pca_correlation_plot
 #' @include PCA_class.R
 pca_correlation_plot<-setClass(
   "pca_correlation_plot",
   contains='chart',
   slots=c(
     # INPUTS
     params.components='entity'
   ),
   prototype = list(name='Feature boxplot',
     description='plots a boxplot of a chosen feature for each group of a dataset.',
     type="boxlot",
     params=c('components'),
     params.components=entity(name='Components to plot',
       value=c(1,2),
       type='numeric',
       description='the components to be plotted e.g. c(1,2) plots component 1 on the x axis and component 2 on the y axis.'
     )
 
   )
 )
 
 #' @export
 setMethod(f="chart.plot",
   signature=c("pca_correlation_plot",'PCA'),
   definition=function(obj,dobj)
   {
     opt=param.list(obj)
     A=data.frame(x=output.value(dobj,'correlation')[,opt$components[1]],y=output.value(dobj,'correlation')[,opt$components[2]])
     dat <- circleFun(c(0,0),2,npoints = 50)
 
     out=ggplot(data=A,aes_(x=~x,y=~y)) +
       geom_point() +
       scale_colour_Publication() +
       theme_Publication(base_size = 12)+
       coord_fixed(xlim = c(-1,1),ylim=c(-1,1)) +
 
       geom_path(data=dat,aes_(x=~x,y=~y),inherit.aes = FALSE)
 
     return(out)
   }
 )
 
 #################################################
 #################################################
 
 #' pca_scores_plot class
 #'
 #' 2d scatter plot of princpal component scores.
 #'
 #' @import struct
 #' @export pca_scores_plot
 #' @include PCA_class.R
 pca_scores_plot<-setClass(
   "pca_scores_plot",
   contains='chart',
   slots=c(
     # INPUTS
     params.components='entity',
     params.points_to_label='enum',
     params.factor_name='entity',
     params.groups='entity',
     params.ellipse='enum'
   ),
 
   prototype = list(name='PCA scores plot',
     description='Plots a 2d scatter plot of the selected components',
     type="scatter",
     params=c('components','points_to_label','factor_name','groups'),
 
     params.components=entity(name='Components to plot',
       value=c(1,2),
       type='numeric',
       description='the components to be plotted e.g. c(1,2) plots component 1 on the x axis and component 2 on the y axis.'
     ),
 
     params.points_to_label=enum(name='points_to_label',
       value='none',
       type='character',
       description='("none"), "all", or "outliers" will be labelled on the plot.'
 
     ),
     params.factor_name=entity(name='Factor name',
       value='factor',
       type='character',
       description='The name of the factor to be displayed on the plot. Appears on axis and legend titles, for example. By default the column name of the meta data will be used where possible.'
     ),
     params.groups=entity(name='Groups',
       value=factor(),
       type='factor',
       description='The name of the factor to be displayed on the plot. Appears on axis and legend titles, for example. By default the column name of the meta data will be used where possible.'
     ),
     params.ellipse=enum(name = 'Plot ellipses',description=c(
       '"all" will plot all ellipses',
       '"group" will only plot group ellipses',
       '"none" will not plot any ellipses',
       '"sample" will plot ellipse for all samples (ignoring group)'),
       list=c('all','group','none','sample'),
       value='all')
   )
 )
 
 
 
 #' @importFrom sp point.in.polygon
 #' @import ggplot2
 #' @importFrom scales squish
 #' @export
 setMethod(f="chart.plot",
   signature=c("pca_scores_plot",'PCA'),
   definition=function(obj,dobj)
   {
 
     if (obj$points_to_label=='outliers' & !(obj$ellipse %in% c('all','sample'))) {
       warning('Outliers are only labelled when plotting the sample ellipse')
     }
 
 
     opt=param.list(obj)
 
     scores=output.value(dobj,'scores')
     pvar=(colSums(scores*scores)/output.value(dobj,'ssx'))*100 # percent variance
     pvar=round(pvar,digits = 2) # round to 2 decimal places
     shapes <- rep(19,nrow(scores)) # filled circles for all samples
     slabels <- rownames(scores)
     x=scores[,opt$components[1]]
     y=scores[,opt$components[2]]
     xlabel=paste("PC",opt$components[[1]],' (',sprintf("%.1f",pvar[opt$components[[1]]]),'%)',sep='')
     ylabel=paste("PC",opt$components[[2]],' (',sprintf("%.1f",pvar[opt$components[[2]]]),'%)',sep='')
 
     # add a space to the front of the labels to offset them from the points, because nudge_x is in data units
     for (i in 1:length(slabels))
     {
       slabels[i]=paste0('  ',slabels[i], '  ')
     }
 
     if (is(opt$groups,'factor')) {
       plotClass=pmp::createClassAndColors(opt$groups)
       opt$groups=plotClass$class
     }
 
     # build the plot
     A <- data.frame (group=opt$groups,x=x, y=y)
 
     out=ggplot (data=A, aes_(x=~x,y=~y,colour=~group,label=~slabels,shapes=~shapes)) +
 
       geom_point(na.rm=TRUE) +
       xlab(xlabel) +
       ylab(ylabel) +
       ggtitle('PCA Scores', subtitle=NULL) +
 
       if (obj$ellipse %in% c('all','group')) {
         stat_ellipse(type='norm') # ellipse for individual groups
       }
 
     if (is(opt$groups,'factor')) # if a factor then plot by group using the colours from pmp package
     {
       out=out+scale_colour_manual(values=plotClass$manual_colors,name=opt$factor_name)
     }
     else # assume continuous and use the default colour gradient
     {
       out=out+scale_colour_viridis_c(limits=quantile(opt$groups,c(0.05,0.95),na.rm = TRUE),oob=squish,name=opt$factor_name)
     }
     out=out+theme_Publication(base_size = 12)
     # add ellipse for all samples (ignoring group)
     if (obj$ellipse %in% c('all','sample')) {
       out=out+stat_ellipse(type='norm',mapping=aes(x=x,y=y),colour="#C0C0C0",linetype='dashed',data=A)
     }
 
     if (obj$ellipse %in% c('all','sample')) { # only do this if we plotted the sample ellipse
       # identify samples outside the ellipse
       build=ggplot_build(out)$data
       points=build[[1]]
       ell=build[[length(build)]]
       # outlier for dataset ellipse
       points$in.ell=as.logical(sp::point.in.polygon(points$x,points$y,ell$x,ell$y))
 
       # label outliers if
       if (opt$points_to_label=='outliers')
       {
         if (!all(points$in.ell))
         {
 
           temp=subset(points,!points$in.ell)
           temp$group=opt$groups[!points$in.ell]
           out=out+geom_text(data=temp,aes_(x=~x,y=~y,label=~label,colour=~group),vjust="inward",hjust="inward")
 
         }
       }
       # add a list of outliers to the plot object
       out$outliers=trimws(slabels[!points$in.ell])
     }
 
     # label all points if requested
     if (opt$points_to_label=='all')
     {
       out=out+geom_text(vjust="inward",hjust="inward")
     }
 
     return(out)
   }
 )
 
 #################################################################
 #################################################################
 
 #' pca_biplot_plot class
 #'
 #' 2d scatter plot of princpal component scores overlaid with principal component loadings.
 #'
 #' @import struct
 #' @export pca_biplot_plot
 #' @include PCA_class.R
 pca_biplot_plot<-setClass(
   "pca_biplot_plot",
   contains='chart',
   slots=c(
     # INPUTS
     params.components='entity',
     params.points_to_label='entity',
     params.factor_name='entity',
     params.groups='entity',
     params.sf='entity'
 
   ),
   prototype = list(name='Feature boxplot',
     description='plots a boxplot of a chosen feature for each group of a dataset.',
     type="boxlot",
     params=c('components','points_to_label','factor_name','groups'),
     params.components=entity(name='Components to plot',
       value=c(1,2),
       type='numeric',
       description='the components to be plotted e.g. c(1,2) plots component 1 on the x axis and component 2 on the y axis.'
     ),
     params.points_to_label=entity(name='points_to_label',
       value='none',
       type='character',
       description='("none"), "all", or "outliers" will be labelled on the plot.'
     ),
     params.factor_name=entity(name='Factor name',
       value='factor',
       type='character',
       description='The name of the factor to be displayed on the plot. Appears on axis and legend titles, for example. By default the column name of the meta data will be used where possible.'
     ),
     params.groups=entity(name='Groups',
       value=factor(),
       type='factor',
       description='The name of the factor to be displayed on the plot. Appears on axis and legend titles, for example. By default the column name of the meta data will be used where possible.'
     ),
     params.scale_factor=entity(name='Loadings scale factor',
       value=0.95,
       type='numeric',
       description='Scaling factor to apply to loadings. Default = 0.95.'
     )
   )
 
 )
 
 #' @export
 setMethod(f="chart.plot",
   signature=c("pca_biplot_plot",'PCA'),
   definition=function(obj,dobj)
   {
     opt=param.list(obj)
     Ts=output.value(dobj,'scores')
     pvar=(colSums(Ts*Ts)/output.value(dobj,'ssx'))*100
     pvar=round(pvar,digits = 1)
     xlabel=paste("PC",opt$components[[1]],' (',sprintf("%.1f",pvar[opt$components[[1]]]),'%)',sep='')
     ylabel=paste("PC",opt$components[[2]],' (',sprintf("%.1f",pvar[opt$components[[2]]]),'%)',sep='')
 
     P=output.value(dobj,'loadings')
     Ev=output.value(dobj,'eigenvalues')
 
     # eigenvalues were square rooted when training PCA
     Ev=Ev[,1]
     Ev=Ev^2
 
     ## unscale the scores
     #ev are the norms of scores
     Ts=as.matrix(Ts) %*% diag(1/Ev) # these are normalised scores
 
     # scale scores and loadings by alpha
     Ts=Ts %*% diag(Ev^(1-opt$scale_factor))
     P=as.matrix(P) %*% diag(Ev^(opt$scale_factor))
 
     # additionaly scale the loadings
     sf=min(max(abs(Ts[,opt$components[1]]))/max(abs(P[,opt$components[1]])),
       max(abs(Ts[,opt$components[2]]))/max(abs(P[,opt$components[2]])))
     output.value(obj,'scores')=as.data.frame(Ts) # nb object not returned, so only temporary scaling
 
     # plot
     A=data.frame("x"=P[,opt$components[1]]*sf*0.8,"y"=P[,opt$components[2]]*sf*0.8)
     SP=pca_scores_plot()
 
 
     if (opt$style=='points')
     {
       out=out+
         geom_point(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~x,y=~y))
     }
     if (opt$style=='arrow')
     {
       out=out+
 
         geom_segment(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~0,y=~0,xend=~x,yend=~y),arrow=arrow(length=unit(8,'points')))
 
     }
     out=out+ggtitle('PCA biplot', subtitle=NULL) +
       xlab(xlabel) + ylab(ylabel)
 
     #label features if requested
     if (opt$label_features)
     {
       vlabels=opt$feature_labels
       for (i in 1:length(opt$feature_labels))
       {
         vlabels[i]=paste0('  ',opt$feature_labels[i], '  ')
       }
       A$vlabels=vlabels
       out=out+
 
         geom_text(data=A,aes_(x=~x,y=~y,label=~vlabels),vjust="inward",hjust="inward",inherit.aes = FALSE)
 
     }
     return(out)
   }
 )
 
 
 
 
 
 
 
 
 # loadings plot
 pca_loadings_plot=function(obj,opt)
 {
   P=output.value(obj,'loadings')
   # 1D plot
   if (length(opt$components)==1)
   {
     A=data.frame("x"=1:nrow(P),"y"=P[,opt$components[1]])
     out=ggplot(data=A,aes_(x=~x,y=~y)) +
       geom_line() +
       ggtitle('PCA Loadings', subtitle=NULL) +
       xlab('Feature') +
       ylab('Loading') +
       scale_colour_Publication() +
       theme_Publication(base_size = 12)
     return(out)
   }
   # 2D plot
   if (length(opt$components)==2)
   {
     A=data.frame("x"=P[,opt$components[1]],"y"=P[,opt$components[2]])
     out=ggplot(data=A,aes_(x=~x,y=~y)) +
       geom_point() +
       ggtitle('PCA Loadings', subtitle=NULL) +
       xlab(paste0('Component ',opt$components[1])) +
       ylab(paste0('Component ',opt$components[2])) +
       scale_colour_Publication() +
       theme_Publication(base_size = 12)
     return(out)
   }
   if (length(opt$components)>2)
   {
     stop('can only plot loadings for 1 or 2 components at a time')
   }
 
 }
 
 
 #' pca_scree_plot class
 #'
 #' line plot showing percent variance and cumulative peercent variance for the computed components.
 #'
 #' @import struct
 #' @export PCA.scree
 #' @include PCA_class.R
 PCA.scree<-setClass(
   "PCA.scree",
   contains=c('chart'),
   prototype = list(name='Scree plot',
     description='plots the percent and cumulative percent variance for the calculated components',
     type="line"
   )
 )
 
 #' @export
 setMethod(f="chart.plot",
   signature=c("PCA.scree",'PCA'),
   definition=function(obj,dobj)
   {
     ## percent variance
     scores=output.value(dobj,'scores')
     pvar=(colSums(scores*scores)/output.value(dobj,'ssx'))*100
     A=data.frame("x"=1:length(pvar),"y"=c(pvar,cumsum(pvar)),"Variance"=as.factor(c(rep('Single component',length(pvar)),rep('Cumulative',length(pvar)))))
     labels=round(A$y,digits = 1)
     labels=format(labels,1)
 
     out=ggplot(data=A, aes_(x=~x,y=~y,color=~Variance)) +
       geom_line() +
       geom_point() +
       geom_text(aes_(label=~labels),color='black',vjust=0,nudge_y = 5) +
 
       ggtitle('Scree Plot', subtitle=NULL) +
       xlab('Component') +
       ylab('Variance (%)') +
       scale_colour_Publication() +
       scale_x_continuous(breaks=0:length(pvar)) +
       guides(color=guide_legend("")) +
       theme_Publication(base_size = 12)
     return(out)
   }
 )
 
 #' pca_dstat_plot class
 #'
 #' line plot showing percent variance and cumulative peercent variance for the computed components.
 #'
 #' @import struct
 #' @export PCA.dstat
 #' @include PCA_class.R
 PCA.dstat<-setClass(
   "PCA.dstat",
   contains=c('chart'),
   slots=c(params.number_components='entity',
     params.alpha='entity'),
   prototype = list(name='d-statistic plot',
     description='a bar chart of the d-statistics for samples in the input PCA model',
     type="bar",
     params.number_components=entity(value = 2,
       name = 'number of principal components',
       description = 'number of principal components to use for the plot',
       type='numeric'),
     params.alpha=entity(value=0.95,
       name='threshold for rejecting outliers',
       description='a confidence threshold for rejecting samples based on the d-statistic',
       type='numeric'),
     params=c('number_components','alpha')
   )
 )
 
 #' @export
 setMethod(f="chart.plot",
   signature=c("PCA.dstat",'PCA'),
   definition=function(obj,dobj)
   {
     opt=param.list(obj)
     a=param.value(obj,'number_components')
     scores=output.value(dobj,'scores')
     I=nrow(scores)             # number of samples
     sample_names=rownames(scores)
     scores=scores[,1:a]
     scores=as.matrix(scores)
     covT=t(scores) %*% scores  # covariance matrix
     covT=solve(covT/(I-1)) # inverse
     H=numeric(length = I)
     for (i in 1:I)
     {
       H[i]=scores[i,,drop=FALSE]%*%covT%*%t(scores[i,,drop=FALSE])
     } #leverage value
 
     # threshold at alpha
     F=qf(p = opt$alpha,df1 = a,df2=I-a)
     sf=(a*(I-1)*(I+1))/(I*(I-a))
     threshold=sf*F
     # ggplot
     df=data.frame(x=sample_names,y=H)
 
     out=ggplot(data=df, aes_(x=~x,y=~y)) +
 
       geom_bar(stat="identity") +
       geom_hline(yintercept=threshold, linetype='dashed', color='grey') +
       ggtitle('d-statistic', subtitle=paste0('Number of components = ',a)) +
       xlab('Sample') +
       ylab('d-statistic') +
       scale_x_discrete (limits = sample_names,breaks=NULL) +
       scale_colour_Publication() +
       theme_Publication(base_size = 12)
     return(out)
   }
 )
 
 
 
 
 circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){
   r = diameter / 2
   tt <- seq(0,2*pi,length.out = npoints)
   xx <- center[1] + r * cos(tt)
   yy <- center[2] + r * sin(tt)
   return(data.frame(x = xx, y = yy))
 }