#' @eval get_description('pca_correlation_plot') #' @import struct #' @export pca_correlation_plot #' @include scatter_chart_class.R PCA_class.R #' @examples #' C = pca_correlation_plot() pca_correlation_plot = function(components=c(1,2),...) { out=struct::new_struct('pca_correlation_plot', components=components, ...) return(out) } .pca_correlation_plot<-setClass( "pca_correlation_plot", contains='chart', slots=c( # INPUTS components='entity' ), prototype = list(name='PCA correlation plot', description=paste0( 'A plot of the correlation between the variables/features and ', 'the selected principal component scores. Features with high ', 'correlation are well represented by the selected component(s)'), type="boxlot", .params=c('components'), components=entity(name='Components to plot', value=c(1,2), type='numeric', description='The Principal Components used to generate the plot.', max_length=2 ) ) ) #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_correlation_plot",'PCA'), definition=function(obj,dobj) { opt=param_list(obj) A=data.frame(x=output_value(dobj,'correlation')[,opt$components[1]],y=output_value(dobj,'correlation')[,opt$components[2]]) dat <- circleFun(c(0,0),2,npoints = 50) out=ggplot(data=A,aes_(x=~x,y=~y)) + geom_point() + scale_colour_Publication() + theme_Publication(base_size = 12)+ coord_fixed(xlim = c(-1,1),ylim=c(-1,1)) + geom_path(data=dat,aes_(x=~x,y=~y),inherit.aes = FALSE) return(out) } ) ################################################# ################################################# #' @eval get_description('pca_scores_plot') #' @import struct #' @export pca_scores_plot #' @include PCA_class.R #' @examples #' D = iris_DatasetExperiment() #' M = mean_centre() + PCA() #' M = model_apply(M,D) #' C = pca_scores_plot(factor_name = 'Species') #' chart_plot(C,M[2]) pca_scores_plot = function( xcol='PC1', ycol='PC2', points_to_label='none', factor_name, ellipse='all', ellipse_type='norm', ellipse_confidence=0.95, label_filter=character(0), label_factor='rownames', label_size=3.88, components=NULL, ...) { if (!is.null(components)){ xcol=components[1] ycol=components[2] } out=struct::new_struct('pca_scores_plot', xcol=xcol, ycol=ycol, points_to_label=points_to_label, factor_name=factor_name, ellipse=ellipse, label_filter=label_filter, label_factor=label_factor, label_size=label_size, ellipse_type=ellipse_type, ellipse_confidence=ellipse_confidence, components = components, ...) return(out) } .pca_scores_plot<-setClass( "pca_scores_plot", contains='scatter_chart', slots=c(components='entity'), prototype = list(name='PCA scores plot', description='Plots a 2d scatter plot of the selected components', components = entity(name='Components to plot', value=NULL, type=c('numeric','integer','NULL'), description='The principal components used to generate the plot. If provided this parameter overrides xcol and ycol params.', max_length=2 ), .params='components' ) ) #' @importFrom sp point.in.polygon #' @import ggplot2 #' @importFrom scales squish #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_scores_plot",'PCA'), definition=function(obj,dobj) { # if provided convert index to names if (is.numeric(obj$xcol)) { obj$xcol=colnames(dobj$scores)[obj$xcol] } if (is.numeric(obj$ycol)) { obj$ycol=colnames(dobj$scores)[obj$ycol] } # percent variance scores=output_value(dobj,'scores')$data pvar = (colSums(scores*scores)/output_value(dobj,'ssx'))*100 # percent variance pvar = round(pvar,digits = 2) # round to 2 decimal places # axis labels pc_idx = which(colnames(dobj$scores)==obj$xcol) pc_idy = which(colnames(dobj$scores)==obj$ycol) xlabel=paste("PC",pc_idx,' (',sprintf("%.1f",pvar[pc_idx]),'%)',sep='') ylabel=paste("PC",pc_idy,' (',sprintf("%.1f",pvar[pc_idy]),'%)',sep='') # copy inputs to scatter chart C = scatter_chart() opt=param_list(obj) opt$components=NULL # exclude as not in scatter chart param_list(C) = opt # plot g = chart_plot(C,dobj$scores) # update axis labels g = g + xlab(xlabel) +ylab(ylabel) +ggtitle('PCA scores') return(g) } ) ################################################################# ################################################################# #' @eval get_description('pca_biplot') #' @import struct #' @export pca_biplot #' @include PCA_class.R #' @examples #' C = pca_biplot(factor_name='Species') pca_biplot = function( components=c(1,2), points_to_label='none', factor_name, scale_factor=0.95, style='points', label_features=FALSE, ...) { out=struct::new_struct('pca_biplot', components=components, points_to_label=points_to_label, factor_name=factor_name, scale_factor=scale_factor, style=style, label_features=label_features, ...) return(out) } .pca_biplot<-setClass( "pca_biplot", contains=c('chart'), slots=c( # INPUTS components='entity', points_to_label='entity', factor_name='entity', scale_factor='entity', style='enum', label_features='entity' ), prototype = list(name='PCA biplot', description=paste0('A scatter plot of the selected principal ', 'component scores overlaid with the corresponding principal ', 'component loadings.'), type="boxlot", .params=c('components','points_to_label','factor_name','scale_factor','style','label_features'), components=entity(name='Components to plot', value=c(1,2), type='numeric', description='The principal components used to generate the plot.', max_length=2 ), points_to_label=enum(name='points_to_label', value='none', type='character', description=c( "none" = 'No samples are labelled on the plot.', "all" = 'All samples are labelled on the plot.', "outliers" = 'Potential outliers are labelled on the plot.'), allowed=c('none','all','outliers') ), factor_name=ents$factor_name, scale_factor=entity(name='Loadings scale factor', value=0.95, type='numeric', description='The scaling factor applied to the loadings.' ), style=enum(name='Plot style', value='points', type='character', description=c( "points" = 'Loadings and scores are plotted as a scatter plot.', 'arrows' = 'The loadings are plotted as arrow vectors.'), allowed=c('points','arrows') ), label_features=entity(name='Add feature labels', value=FALSE, type='logical', description=c( "TRUE" = 'Features are labelled.', "FALSE" = "Features are not labelled.") ) ) ) #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_biplot",'PCA'), definition=function(obj,dobj) { opt=param_list(obj) Ts=output_value(dobj,'scores')$data pvar=(colSums(Ts*Ts)/output_value(dobj,'ssx'))*100 pvar=round(pvar,digits = 1) xlabel=paste("PC",opt$components[[1]],' (',sprintf("%.1f",pvar[opt$components[[1]]]),'%)',sep='') ylabel=paste("PC",opt$components[[2]],' (',sprintf("%.1f",pvar[opt$components[[2]]]),'%)',sep='') P=output_value(dobj,'loadings') Ev=output_value(dobj,'eigenvalues') sf = max(abs(Ts)) / max(abs(P)) * opt$scale_factor # plot A=data.frame("x"=P[,opt$components[1]]*sf,"y"=P[,opt$components[2]]*sf) C=pca_scores_plot(points_to_label=obj$points_to_label,xcol=obj$components[1],ycol=obj$components[2],factor_name=obj$factor_name) out=chart_plot(C,dobj) if (opt$style=='points') { out=out+ geom_point(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~x,y=~y)) } if (opt$style=='arrows') { out=out+ geom_segment(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~0,y=~0,xend=~x,yend=~y),arrow=arrow(length=unit(8,'points'))) } out=out+ggtitle('PCA biplot', subtitle=NULL) + xlab(xlabel) + ylab(ylabel) #label features if requested if (opt$label_features) { vlabels=rownames(dobj$loadings) for (i in 1:length(vlabels)) { vlabels[i]=paste0(' ',vlabels[i], ' ') } A$vlabels=vlabels out=out+ geom_text(data=A,aes_(x=~x,y=~y,label=~vlabels),vjust="inward",hjust="inward",inherit.aes = FALSE) } return(out) } ) ################################################################## ################################################################## #' @eval get_description('pca_loadings_plot') #' @import struct #' @export pca_loadings_plot #' @include PCA_class.R #' @examples #' C = pca_loadings_plot() pca_loadings_plot = function(components=c(1,2),style='points',label_features=NULL,...) { out=struct::new_struct('pca_loadings_plot', components=components, style=style, label_features=label_features, ...) return(out) } .pca_loadings_plot<-setClass( "pca_loadings_plot", contains='chart', slots=c( # INPUTS components='entity', style='enum', label_features='entity' ), prototype = list(name='PCA loadings plot', description=paste0('A barchart (one component) or scatter plot ', '(two components) of the selected principal component loadings.'), type="boxlot", .params=c('components','style','label_features'), components=entity(name='Components to plot', value=c(1,2), type='numeric', description='The principal components used to generate the plot.', max_length=2 ), style=enum(name='Plot style', value='points', type='character', description=c( "points" = 'Loadings and scores are plotted as a scatter plot.', 'arrows' = 'The loadings are plotted as arrow vectors.'), allowed=c('points','arrows') ), label_features=entity(name='Feature labels', value=NULL, type=c('character',"NULL"), description=c( 'character()'='A vector of labels for the features', 'NULL' = 'No labels', 'row.names' = 'Labels will be extracted from the column names of the data matrix.' ) ) ) ) #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_loadings_plot",'PCA'), definition=function(obj,dobj) { opt=param_list(obj) P=output_value(dobj,'loadings') # 1D plot if (length(opt$components)==1) { A=data.frame("x"=1:nrow(P),"y"=P[,opt$components[1]]) out=ggplot(data=A,aes_(x=~x,y=~y)) + geom_line() + ggtitle('PCA Loadings', subtitle=NULL) + xlab('Feature') + ylab('Loading') + scale_colour_Publication() + theme_Publication(base_size = 12) } # 2D plot if (length(opt$components)==2) { A=data.frame("x"=P[,opt$components[1]],"y"=P[,opt$components[2]]) out=ggplot(data=A,aes_(x=~x,y=~y)) if (opt$style=='points'){ out=out+ geom_point(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~x,y=~y)) } if (opt$style=='arrows'){ out=out+ geom_segment(data=A,inherit.aes = FALSE,color='black',mapping = aes_(x=~0,y=~0,xend=~x,yend=~y),arrow=arrow(length=unit(8,'points'))) } out= out + ggtitle('PCA Loadings', subtitle=NULL) + xlab(paste0('Component ',opt$components[1])) + ylab(paste0('Component ',opt$components[2])) + scale_colour_Publication() + theme_Publication(base_size = 12) if (!is.null(obj$label_features)) { if (obj$label_features=='row.names') { vlabels=rownames(dobj$loadings) } else { vlabels=obj$label_features } for (i in 1:length(vlabels)) { vlabels[i]=paste0(' ',vlabels[i], ' ') } A$vlabels=vlabels out=out+geom_text(data=A,aes_(x=~x,y=~y,label=~vlabels), vjust="inward",hjust="inward",inherit.aes = FALSE) } } if (length(opt$components)>2) { stop('can only plot loadings for 1 or 2 components at a time') } return(out) } ) #' @eval get_description('pca_scree_plot') #' @import struct #' @return struct object #' @export pca_scree_plot #' @include PCA_class.R #' @examples #' C = pca_scree_plot() pca_scree_plot = function(max_pc=15,...) { out=struct::new_struct('pca_scree_plot', max_pc=max_pc, ...) return(out) } .pca_scree_plot<-setClass( "pca_scree_plot", contains=c('chart'), slots=c(max_pc='entity'), prototype = list(name='Scree plot', description=paste0('A plot of the percent variance and cumulative ', 'percent variance for the components of a PCA model. '), type="line", ontology="STATO:0000386", .params=c('max_pc'), max_pc=entity( name='Maximum number of components', description='The maximum number of components to include in the plot.', value=15, type=c('numeric','integer'), max_length=1 ) ) ) #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_scree_plot",'PCA'), definition=function(obj,dobj) { ## percent variance scores=output_value(dobj,'scores')$data pvar=(colSums(scores*scores)/output_value(dobj,'ssx'))*100 # only include up to max number of components pvar=pvar[1:min(c(obj$max_pc,length(pvar)))] A=data.frame("x"=1:length(pvar),"y"=c(pvar,cumsum(pvar)),"Variance"=as.factor(c(rep('Single component',length(pvar)),rep('Cumulative',length(pvar))))) labels=round(A$y,digits = 1) labels=format(labels,1) out=ggplot(data=A, aes_(x=~x,y=~y,color=~Variance)) + geom_line() + geom_point() + geom_text(aes_(label=~labels),color='black',vjust=0,nudge_y = 5) + ggtitle('Scree Plot', subtitle=NULL) + xlab('Component') + ylab('Variance (%)') + scale_colour_Publication() + scale_x_continuous(breaks=0:length(pvar)) + guides(color=guide_legend("")) + theme_Publication(base_size = 12) return(out) } ) #' @eval get_description('pca_dstat_plot') #' @import struct #' @export pca_dstat_plot #' @include PCA_class.R #' @examples #' C = pca_dstat_plot() pca_dstat_plot = function(number_components=2,alpha=0.05,...) { out=struct::new_struct('pca_dstat_plot', number_components=number_components, alpha=alpha, ...) return(out) } .pca_dstat_plot<-setClass( "pca_dstat_plot", contains=c('chart'), slots=c(number_components='entity', alpha='entity'), prototype = list(name='d-statistic plot', description=paste0('A bar chart of the d-statistics for samples in ', 'the input PCA model. Samples above the indicated threshold are ', 'considered to be outlying.'), type="bar", .params=c('number_components','alpha'), ontology='STATO:0000132', number_components=entity(value = 2, name = 'Number of principal components', description = 'The number of principal components to use.', type='numeric'), alpha=entity(value=0.95, name='Outlier threshold', description='A confidence threshold for rejecting samples based on the d-statistic', type='numeric') ) ) #' @export #' @template chart_plot setMethod(f="chart_plot", signature=c("pca_dstat_plot",'PCA'), definition=function(obj,dobj) { opt=param_list(obj) a=param_value(obj,'number_components') scores=output_value(dobj,'scores')$data I=nrow(scores) # number of samples sample_names=rownames(scores) scores=scores[,1:a] scores=as.matrix(scores) covT=t(scores) %*% scores # covariance matrix covT=solve(covT/(I-1)) # inverse H=numeric(length = I) for (i in 1:I) { H[i]=scores[i,,drop=FALSE]%*%covT%*%t(scores[i,,drop=FALSE]) } #leverage value # threshold at alpha F=qf(p = opt$alpha,df1 = a,df2=I-a) sf=(a*(I-1)*(I+1))/(I*(I-a)) threshold=sf*F # ggplot df=data.frame(x=sample_names,y=H) out=ggplot(data=df, aes_(x=~x,y=~y)) + geom_bar(stat="identity") + geom_hline(yintercept=threshold, linetype='dashed', color='grey') + ggtitle('d-statistic', subtitle=paste0('Number of components = ',a)) + xlab('Sample') + ylab('d-statistic') + scale_x_discrete (limits = sample_names,breaks=NULL) + scale_colour_Publication() + theme_Publication(base_size = 12) return(out) } ) circleFun <- function(center = c(0,0),diameter = 1, npoints = 100){ r = diameter / 2 tt <- seq(0,2*pi,length.out = npoints) xx <- center[1] + r * cos(tt) yy <- center[2] + r * sin(tt) return(data.frame(x = xx, y = yy)) }