Bioconductor Code: structToolbox

Raw Blame Patch Log History
#' PLSR model class
#'
#' Partial least squares (PLS) Regression model class. This object can be used to train/apply PLS models.
#' @export PLSR
#' @importFrom pls plsr scores
PLSR<-setClass(

  "PLSR",
  contains='model',

  slots=c(
    params.number_components = 'entity',
    params.factor_name       = 'entity',
    outputs.scores           = 'data.frame',
    outputs.loadings         = 'data.frame',
    outputs.yhat             = 'data.frame',
    outputs.y                = 'data.frame',
    outputs.reg_coeff        = 'data.frame',
    outputs.vip              = 'data.frame',
    outputs.pls_model        = 'list',
    outputs.pred             = 'data.frame'
  ),

  prototype = list(name='Partial least squares regression',
    type="regression",
    predicted='pred',
    params.number_components=entity(value = 2,name = 'Number of PLS components',description = 'The number of PLS components to use',type = 'numeric'),
    params.factor_name=entity(name='Factor name', description='A vector of sample_meta column names to use')
    )

)

#' @export
setMethod(f="model.train",
  signature=c("PLSR",'dataset'),
  definition=function(M,D)
  {
    y=dataset.sample_meta(D)
    y=y[,M$factor_name] # might be multiple columns (?)

    output.value(M,'y')=as.data.frame(y)

    X=as.matrix(dataset.data(D)) # convert X to matrix
    y=as.matrix(y)
    #print(param.value(M,'number_components'))
    pls_model=pls::plsr(y ~ X,validation="none",
                              method='oscorespls',
                              model=TRUE,
                              ncomp=param.value(M,'number_components'),
                              center=FALSE,
                              scale=FALSE)
    ny=ncol(y)
    nr=ncol(output.value(M,'reg_coeff'))
    output.value(M,'reg_coeff')=as.data.frame(output.value(M,'reg_coeff')[,(nr-ny+1):nr]) # keep only the requested number of components
    output.value(M,'vip')=as.data.frame(vips(pls_model))
    yhat=predict(pls_model, ncomp = param.value(M,'number_components'), newdata = X)
    yhat=yhat[,,dim(yhat)[3]]
    output.value(M,'yhat')=as.data.frame(yhat)

    output.value(M,'pls_model')=list(pls_model)
    scores=pls::scores(pls_model)
    output.value(M,'scores')=as.data.frame(matrix(scores,nrow = nrow(scores),ncol=ncol(scores)))
    return(M)
  }
)

#' @export
setMethod(f="model.predict",
  signature=c("PLSR",'dataset'),
  definition=function(M,D)
  {
    # convert X to matrix
    X=as.matrix(dataset.data(D))
    # get training set y
    y=output.value(M,'y')

    # get predictions
    p=predict(output.value(M,'pls_model')[[1]], ncomp = param.value(M,'number_components'), newdata = X)
    p=p[,,dim(p)[3]]

    q=data.frame("pred"=p)
    output.value(M,'pred')=q
    return(M)
  }
)



vips<-function(object)
{
  # modified from http://mevik.net/work/software/pls.html
  q=object$Yloadings
  ny=nrow(object$Yloadings) # number of groups
  vip=matrix(0,nrow=nrow(object$loadings),ny)
  for (i in 1:ny)
  {
    qq=q[i,,drop=FALSE] # for group i only
    SS <- c(qq)^2 * colSums(object$scores^2)
    Wnorm2 <- colSums(object$loading.weights^2)
    SSW <- sweep(object$loading.weights^2, 2, SS / Wnorm2, "*")
    v=sqrt(nrow(SSW) * apply(SSW, 1, cumsum) / cumsum(SS))
    if (ncol(SSW)>1)
      vip[,i]=t(v[nrow(v),,drop=FALSE]) # get number of calculated components only
    else
    {
      vip[,i]=t(v)
    }
  }
  return(vip)
}

#' plsr_prediction_plot class
#'
#' plots the true values against the predicted values for a PLSR model.
#'
#' @import struct
#' @export plsr_prediction_plot
#' @include PLSR_class.R
plsr_prediction_plot<-setClass(
  "plsr_prediction_plot",
  contains='chart',
  prototype = list(name='PLSR prediction plot',
    description='plots the true values against the predicted values for a PLSR model.',
    type="scatterplot"
  )
)

#' @export
setMethod(f="chart.plot",
  signature=c("plsr_prediction_plot",'PLSR'),
  definition=function(obj,dobj)
  {
    R2=r_squared()
    R2=calculate(R2,dobj$y[,1],dobj$yhat[,1])

    B=data.frame(x=rep(dobj$y[,1],2),y=c(dobj$y[,1],dobj$yhat[,1]),group=rep(1:nrow(dobj$y),2))
    A=data.frame(x=dobj$y[,1],y=dobj$yhat[,1])
    p=ggplot(data=A,aes(x=x,y=y)) +
      geom_line(data=B,aes(x=x,y=y,group=group),color='#B2B2B2') +
      geom_point(color="blue") +
      geom_abline(slope=1,intercept=0,color="red")+
      scale_colour_Publication() +
      theme_Publication(base_size = 12) +
      xlab('True values') +
      ylab('Predicted values') +


      annotate("text",x=-Inf,y=Inf,label=paste0('R^2 == ',format(value(R2),digits = 2)),vjust=2,hjust=-1,parse=TRUE)
    return(p)
  }
)

#' plsr_residual_hist class
#'
#' plots a histogram of the residuals
#'
#' @import struct
#' @export plsr_residual_hist
#' @include PLSR_class.R
plsr_residual_hist<-setClass(
  "plsr_residual_hist",
  contains='chart',
  prototype = list(name='PLSR residuals histogram',
    description='Plots a histogram of the residuals for a PLSR model.',
    type="histogram"
  )
)

#' @export
setMethod(f="chart.plot",
  signature=c("plsr_residual_hist",'PLSR'),
  definition=function(obj,dobj)
  {
    d=dobj$y[,1]-dobj$yhat[,1]
    bw=0.2
    n=length(d)
    nx=seq(min(d),max(d),length.out = 100)
    nc=dnorm(nx,0,sd(d))*n*bw

    B=data.frame(nx,nc)
    A=data.frame(y=d)
    p=ggplot(data=A,aes(x=y)) +
      geom_histogram(color="#10C8CD",fill="#b2e9eb",binwidth=0.2) +
      geom_line(data=B,aes(x=nx,y=nc),color="blue") +
      scale_colour_Publication() +
      theme_Publication(base_size = 12) +
      xlab('Residual')+
      ylab('Count')

    return(p)
  }
)

#' plsr_qq_plot class
#'
#' plots the quantiles of PLSR residuals against the qualtiles of a normal
#' distribution
#'
#' @import struct
#' @export plsr_qq_plot
#' @include PLSR_class.R
plsr_qq_plot<-setClass(
  "plsr_qq_plot",
  contains='chart',
  prototype = list(name='PLSR QQ plot',
    description='plots the quantiles of PLSR residuals against the qualtiles of a normal
    distribution',
    type="scatter"
  )
)

#' @export
setMethod(f="chart.plot",
  signature=c("plsr_qq_plot",'PLSR'),
  definition=function(obj,dobj)
  {

    x=sort(dobj$y[,1]-dobj$yhat[,1])
    p=seq(0,1,length.out = length(x))
    q=qnorm(p,0,sd(x))
    A=data.frame('x'=q,'y'=x)
    p=ggplot(data=A,aes(x=x,y=y)) +
      geom_point(color="blue")+
      geom_abline(slope=1,intercept=0,color="red")+

      scale_colour_Publication() +
      theme_Publication(base_size = 12) +
      xlab('Theoretical quantiles')+
      ylab('Residuals quantiles')

    return(p)
  }
)

#' plsr_residual_plot class
#'
#' Plots the residuals for a PLSR model
#'
#' @import struct
#' @export plsr_cook_dist
#' @include PLSR_class.R
plsr_cook_dist<-setClass(
  "plsr_cook_dist",
  contains='chart',
  prototype = list(name='Cook distance barchart',
    description='Plots Cook\'s distance for each sample of a PLSR model',
    type="barchart"
  )
)

#' @export
setMethod(f="chart.plot",
  signature=c("plsr_cook_dist",'PLSR'),
  definition=function(obj,dobj)
  {

    # residual
    e=as.matrix(dobj$y-dobj$yhat)^2 # e^2

    # leverage
    T=as.matrix(dobj$scores)
    H=T %*% solve(t(T) %*% T) %*% t(T)
    H=diag(H)
    s2=(1/(nrow(T)-ncol(T))) * sum(e)

    # cook distance
    CD=(e/(s2*ncol(T)))*(H/((1-H)^2))

    A=data.frame(x=seq_len(length(CD)),y=CD)
    p=ggplot(data=A,aes(x=x,y=y)) +
      geom_col(width=1,color="black",fill='#B2B2B2')+
      scale_colour_Publication() +
      theme_Publication(base_size = 12) +
      xlab('Sample number')+
      ylab('Cook\'s distance')+
      geom_hline(yintercept=4/(nrow(T)-ncol(T)),color='red')


    return(p)
  }
)