Bioconductor Code: structToolbox

Raw Blame Patch Log History
#' @eval get_description('fold_change')
#' @examples
#' D = MTBLS79_DatasetExperiment()
#' M = fold_change(factor_name='Class')
#' M = model_apply(M,D)
#' @import stats
#' @export fold_change
fold_change = function(
    factor_name,
    paired=FALSE,
    sample_name=character(0),
    threshold=2,
    control_group=character(0),
    method = "geometric",
    conf_level = 0.95,
    ...) {
    
    out=struct::new_struct('fold_change',
        factor_name=factor_name,
        paired=paired,
        sample_name=sample_name,
        threshold=threshold,
        control_group=control_group,
        method = method,
        conf_level=conf_level,
        ...)
    
    return(out)
}


.fold_change<-setClass(
    "fold_change",
    contains=c('model'),
    slots=c(
        # INPUTS
        factor_name='entity',
        paired='entity',
        sample_name='entity',
        threshold='entity',
        control_group='entity',
        method='entity',
        conf_level='entity',
        
        # OUTPUTS
        fold_change='entity',
        upper_ci='entity',
        lower_ci='entity',
        significant='entity'
    ),
    prototype = list(name='Fold change',
        description=paste0('Fold change is the relative change in mean (or ',
            'non-parametric equivalent) intensities of a feature between all pairs of levels ',
            'in a factor.'),
        type="univariate",
        predicted='fold_change',
        citations=list(
            bibentry(
                bibtype='Article',
                year=2020,
                volume=45,
                number=6,
                pages='750-770',
                author=as.person('Robert M. Price Jr and Douglas G. Bonett'),
                title='Confidence Intervals for Ratios of Means and Medians',
                journal='Journal of Educational and Behavioral Statistics'
            )
        ),
        #  ontology="STATO:0000304",
        .params=c('factor_name','sample_name','paired','threshold','control_group','method','conf_level'),
        .outputs=c('fold_change','lower_ci','upper_ci','significant'),
        
        factor_name=ents$factor_name,
        sample_name=entity(name='Sample names',
            type='character',
            description='The name of a sample_meta column containing sample identifiers for paired sampling.'
        ),

        paired=entity(name='Paired fold change',
            value=FALSE,
            type='logical',
            description=c(
                'TRUE' = 'Fold change is calculated taking into account paired sampling.',
                'FALSE'= 'Fold change is calculated assuming there is no paired sampling.'),
        ),
        
        fold_change=entity(name='Fold change',
            type='data.frame',
            description='The fold change between groups',
            value=data.frame()
        ),
        
        lower_ci=entity(name='Lower confidence interval',
            type='data.frame',
            description='Lower confidence interval for fold change',
            value=data.frame()
        ),
        upper_ci=entity(name='Upper confidence interval',
            type='data.frame',
            description='Upper confidence interval for fold change.',
            value=data.frame()
        ),
        method=enum(name='Fold change method',
            type='character',
            description=c(
                'geometric' = paste0('A log transform is applied before using ',
                    'group means to calculate fold change. In the non-tranformed',
                    'space this is equivalent to using geometric group means. ',
                    'Confidence intervals for independant and paired sampling ',
                    'are estimated using standard error of the mean in log ',
                    'transformed space before being transformed back to the ',
                    'original space.'),
                "median" = paste0('The group medians and the method described ',
                    'by Price and Bonett is used to ',
                    'estimate confidence intervals. For paired data standard ',
                    'error of the median is used to estimate confidence ',
                    'intervals from the median fold change of all pairs.'),
                "mean" = paste0('The group means and the method described by ',
                    'Price and Bonnet is used to estimate confidence ',
                    'intervals. For paired data standard error of the mean is ',
                    'used to estimate confidence intervals from the mean ',
                    'fold change of all pairs.')
            ),
            value="geometric",
            allowed=c("geometric","median","mean")
        ),
        control_group=entity(
            name='Control group',
            description = paste0('The level name of the group used in ',
                'the denominator (where possible) when computing fold change.'
            ),
            type='character',
            max_length = 1,
            value=character(0)
        ),
        threshold=entity(
            name='threshold',
            description = paste0('The fold change threshold for labelling ',
                'features as significant.'
            ),
            type='numeric',
            max_length = 1,
            value=2
        ),
        significant = entity(
            name='Significant features',
            description=paste0('A logical indictor of whether the calculated ',
            'fold change including the estimated confidence limits is greater ',
            'than the selected threshold.'),
            type='data.frame',
            value=data.frame()
        ),
        conf_level = entity(
            name = 'Confidence level',
            description = 'The confidence level of the interval.',
            type='numeric',
            value=0.95,
            max_length = 1
        )
    )
)


#' @export
#' @template model_apply
setMethod(f="model_apply",
    signature=c("fold_change",'DatasetExperiment'),
    definition=function(M,D)
    {
        # log transform
        if (M$method=='geometric') {
            # log transform
            D$data=log2(D$data)
        }
        X=D$data
        Y=D$sample_meta
        
        # levels for factor of interest
        L=levels(as.factor(Y[[M$factor_name]]))
        L=rev(L)
        # put control group last, if provided
        if (length(M$control_group)>0) {
            w=which(L==M$control_group)
            if (length(w)>0) {
                L=c(L[-w],L[w])
            }
        }
        
        # number of pairwise comparisons
        n=((length(L)^2)-length(L))/2
        
        # prep some matrices
        FC=matrix(NA,nrow=ncol(X),ncol=n)
        rownames(FC)=colnames(D)
        LCI=FC
        UCI=FC
        
        comp=character(0)
        
        counter=1
        
        D$sample_meta[[M$factor_name]]=ordered(D$sample_meta[[M$factor_name]])
        
        # for all pairs of groups
        for (A in 1:(length(L)-1)) {
            for (B in (A+1):(length(L))) {
                
                # filter groups to A and B
                FG=filter_smeta(factor_name=M$factor_name,mode='include',levels=L[c(A,B)])
                FG=model_apply(FG,D)
                # change to ordered factor so that we make use of control group
                FG$filtered$sample_meta[[M$factor_name]]=ordered(FG$filtered$sample_meta[[M$factor_name]],levels=L[c(A,B)])
                
                if (M$method=='geometric') {
                    
                    # apply t-test
                    TT=ttest(alpha=0.05,mtc='none',factor_names=M$factor_name,paired=M$paired,paired_factor=M$sample_name,conf_level=M$conf_level)
                    TT=model_apply(TT,predicted(FG))
                    # log2(fold change) is the difference in estimate.mean from ttest
                    if (M$paired) {
                        fc=TT$estimates[,1]
                    } else {
                        fc=TT$estimates[,1]-TT$estimates[,2] # log2 fold change
                    }
                    FC[,counter]=fc
                    LCI[,counter]=TT$conf_int[,1]
                    UCI[,counter]=TT$conf_int[,2]
                    counter=counter+1
                    comp=c(comp,paste0(L[A],'/',L[B]))
                    
                } else if (M$method == 'median') {
                    
                    D2 = predicted(FG)
                    # check for pairs in features
                    if (M$paired) {
                        FF=pairs_filter(
                            factor_name=M$factor_name,
                            sample_id=M$sample_name)
                        FF=model_apply(FF,D2)
                        D2=predicted(FF)
                    }
                    
                    
                    # calculate medians and confidence intervals for all features
                    out=lapply(D2$data,function(x) {
                        y1=x[D2$sample_meta[[M$factor_name]]==L[A]]
                        y2=x[D2$sample_meta[[M$factor_name]]==L[B]]
                        
                        if (M$paired) {
                            # if one of pair is NA, set both to NA
                            y1[is.na(y2)]=NA
                            y2[is.na(y1)]=NA
                        }
                        
                        if (all(is.na(y1)) | all(is.na(y2))) {
                            val=rep(NA,3) # report NA if a group is missing
                        } else {
                            val=ci_delta_nu(y1,y2,alpha=1-M$conf_level,paired=M$paired)
                        }
                        
                        val=matrix(val,nrow=1,ncol=3)
                        colnames(val)=c('median','lci','uci')
                        return(val)
                    })
                    
                    nmes=names(out)
                    out=do.call(rbind,out)
                    rownames(out)=nmes
                    
                    # merge with original names
                    temp=merge(
                        FC[,counter,drop=FALSE],
                        out,
                        by = 'row.names',
                        all.x=TRUE,
                    )
                    rownames(temp)=temp$Row.names
                    # sort back into original order
                    temp=temp[rownames(FC),]
                    
                    FC[,counter]=temp$median
                    LCI[,counter]=temp$lci
                    UCI[,counter]=temp$uci
                    counter=counter+1
                    comp=c(comp,paste0(L[A],'/',L[B]))
                    
                } else if (M$method == 'mean') {
                    D2 = predicted(FG)
                    # check for pairs in features
                    if (M$paired) {
                        FF=pairs_filter(
                            factor_name=M$factor_name,
                            sample_id=M$sample_name)
                        FF=model_apply(FF,D2)
                        D2=predicted(FF)
                        
                        # calculate means and confidence intervals for all features
                        out=lapply(D2$data,function(x) {
                            
                            y1=x[D2$sample_meta[[M$factor_name]]==L[A]]
                            y2=x[D2$sample_meta[[M$factor_name]]==L[B]]
                            
                            if (M$paired) {
                                # if one of pair is NA, set both to NA
                                y1[is.na(y2)]=NA
                                y2[is.na(y1)]=NA
                            }
                            
                            if (all(is.na(y1)) | all(is.na(y2))) {
                                ret=matrix(NA,nrow=1,ncol=3) # report NA if a group is missing
                                colnames(ret)=c('fold_change','lower_ci','upper_ci')
                            } else {
                                ret=ci.mean.paired(1-M$conf_level,y1,y2)
                            }
                            return(ret)
                        })
                        
                        
                    } else {
                        # calculate means and confidence intervals for all features
                        out=lapply(D2$data,function(x) {
                            y1=x[D2$sample_meta[[M$factor_name]]==L[A]]
                            y2=x[D2$sample_meta[[M$factor_name]]==L[B]]
                            
                            if (all(is.na(y1)) | all(is.na(y2))) {
                                ret=matrix(NA,nrow=1,ncol=8) # report NA if a group is missing
                                colnames(ret)=c("Mean1", "Mean2", "df", "  Mean1/Mean2", "LL", "UL", "  Log-ratio", "SE")
                            } else {
                                ret=ci.mean.bs(1-M$conf_level,na.exclude(y1),na.exclude(y2))
                            }
                            return(ret)
                        })
                        
                        
                    }
                    nmes=names(out)
                    out=do.call(rbind,out)
                    rownames(out)=nmes
                    
                    # merge with original names
                    temp=merge(
                        FC[,counter,drop=FALSE],
                        out,
                        by = 'row.names',
                        all.x=TRUE,
                    )
                    rownames(temp)=temp$Row.names
                    # sort back into original order
                    temp=temp[rownames(FC),]
                    
                    if (M$paired) {
                        FC[,counter]=temp[,3]
                        LCI[,counter]=temp[,4]
                        UCI[,counter]=temp[,5]
                    } else {
                        FC[,counter]=temp[,6]
                        LCI[,counter]=temp[,7]
                        UCI[,counter]=temp[,8]
                    }
                    counter=counter+1
                    comp=c(comp,paste0(L[A],'/',L[B]))
                }
            }
        }

        colnames(FC)=comp
        colnames(LCI)=comp
        colnames(UCI)=comp

        # store in object
        if (M$method=='geometric') {
            #store in object
            M$fold_change=as.data.frame(2^FC)
            M$lower_ci=as.data.frame(2^LCI)
            M$upper_ci=as.data.frame(2^UCI) 
        } else {
            M$fold_change = as.data.frame(FC)
            M$lower_ci = as.data.frame(LCI)
            M$upper_ci = as.data.frame(UCI)
        }
        
        check1 = M$lower_ci > M$threshold
        check2 = M$upper_ci < 1/M$threshold
        M$significant = data.frame(significant=check1 | check2)
        colnames(M$significant)=comp
        rownames(M$significant)=colnames(D)
        return(M)
    }
)

#' @eval get_description('fold_change_plot')
#' @export fold_change_plot
#' @include PCA_class.R
#' @examples
#' C = fold_change_plot()
fold_change_plot = function(number_features=20,orientation='portrait',...) {
    out=struct::new_struct('fold_change_plot',
        number_features=number_features,
        orientation=orientation,
        ...)
    return(out)
}


.fold_change_plot<-setClass(
    "fold_change_plot",
    contains='chart',
    slots=c(
        number_features='entity',
        orientation='entity'),
    prototype = list(name='Fold change plot',
        description=paste0('A plot of fold changes calculated for a chosen ',
        'subset of features. A predefined fold change threshold is indicated ',
        'by shaded regions.'),
        type="boxlot",
        .params=c('number_features','orientation'),
        number_features=entity(
            name='Features to plot',
            description = 'The number randomly selected features to plot, or 
            a list of column numbers.',
            type='numeric',
            value=10
        ),
        orientation=enum(
            name='Plot orientation',
            description = c(
            'landscape' = 'Features are plotted on the y-axis.',
            'portrait' = 'Features are plotted on the x-axis.'
            ),
            type='character',
            value='portrait',
            allowed=c('portrait','landscape'),
            max_length=1
        )
    )
    
)

#' @export
#' @template chart_plot
setMethod(f="chart_plot",
    signature=c("fold_change_plot",'fold_change'),
    definition=function(obj,dobj)
    {
        
        # choose randomly,or use provided vector
        if (length(obj$number_features)==1) {
            S=sample.int(nrow(dobj$fold_change),size=obj$number_features)
        } else {
            S=obj$number_features
        }
        
        
        A=data.frame(
            'fc'=dobj$fold_change[S,1],
            'group'=colnames(dobj$fold_change)[1],
            'lci'=dobj$lower_ci[S,1],
            'uci'=dobj$upper_ci[S,1],
            'feature'=rownames(dobj$fold_change)[S])
        if (ncol(dobj$fold_change)>1) {
            for (k in 2:ncol(dobj$fold_change)) {
                B=data.frame(
                    'fc'=dobj$fold_change[S,k],
                    'group'=colnames(dobj$fold_change)[k],
                    'lci'=dobj$lower_ci[S,k],
                    'uci'=dobj$upper_ci[S,k],
                    'feature'=rownames(dobj$fold_change)[S])
                A=rbind(A,B)
            }
        }
        A[,c(1,3,4)]=log2(A[,c(1,3,4)])
        A$x=1:nrow(A)
        
        out=ggplot(data=A,aes(x=feature,y=fc,color=group)) +
            geom_rect(xmin=-Inf,xmax=Inf,ymin=-Inf,ymax=-log2(dobj$threshold),fill='#9EC086',inherit.aes = FALSE,alpha=0.02) +
            geom_rect(xmin=-Inf,xmax=Inf,ymax=Inf,ymin=log2(dobj$threshold),fill='#9EC086',inherit.aes = FALSE,alpha=0.02) +
            #geom_rect(xmin=-Inf,xmax=Inf,ymax=log2(M$threshold),ymin=-log2(M$threshold),fill='#B6B6B6',inherit.aes = FALSE,alpha=0.02) +
            geom_pointrange(aes(ymin=lci,ymax=uci),position=position_dodge(width=0.75)) +
            #geom_hline(yintercept = log2(dobj$threshold),color='red') +
            #geom_hline(yintercept = -log2(dobj$threshold),color='red') +
            xlab('Feature') +
            ylab('log2(Fold change)')+
            scale_colour_Publication() +
            theme_Publication(base_size = 12)
        
        if (obj$orientation=='landscape') {
            out=out+coord_flip()
            
        } else {
            out=out+theme(axis.text.x = element_text(angle = 90,hjust=1,vjust=0.5))
        }
        
        return(out)
    }
)





ci_delta_nu = function(y1,y2,alpha=0.05,paired=FALSE) {
    
    if (!paired) {
        out=ci.median.bs(alpha=alpha,y1=y1,y2=y2)
        return(out)
    } else {
        if (length(y1) != length(y2)) {
            stop('the same number of samples must be present in all groups for a paired comparison')
        } 
        r = y1/y2
        r=r[!is.na(r)]
        mr=mean(r)
        md=median(r)
        s=sd(r)/sqrt(length(r)) # standard error of mean
        z=qt(1-(alpha/2),length(r)-1)
        sf=sqrt(pi/2)
        
        out=t(c(md,md-(sf*z*s),md+(sf*z*s)))
        colnames(out)=c('fold_change','lower_ci','upper_ci')
        
        return(out)
    }
    
}

ci.mean.bs <- function(alpha, y1, y2){
    # from 10.3102/1076998620934125
    # Compute confidence interval for a ratio of population means of ratio-scale
    # measurements in a design with a 2-level between-subjects factor. 
    # Arguments:
    #   alpha:  alpha level for 1-alpha confidence
    #   y1      n1 x 1 vector of scores for group 1
    #   y2:     n2 x 1 vector of scores for group 2
    # Returns:
    #   confidence interval
    y1=y1[!is.na(y1)]
    y2=y2[!is.na(y2)]
    
    n1 <- length(y1)
    n2 <- length(y2)
    m1 <- mean(y1)
    m2 <- mean(y2)
    v1 <- var(y1)
    v2 <- var(y2)
    var <- v1/(n1*m1^2) + v2/(n2*m2^2)
    df <- var^2/(v1^2/(m1^4*(n1^3 - n1^2)) + v2^2/(m2^4*(n2^3 - n2^2)))
    t <- qt(1 - alpha/2, df)
    est <- log(m1/m2)
    se <- sqrt(var)
    ll <- exp(est - t*se)
    ul <- exp(est + t*se)
    out <- t(c(m1, m2, df, exp(est), ll, ul, est, se))
    colnames(out) <- c("Mean1", "Mean2", "df", "  Mean1/Mean2", "LL", "UL", "  Log-ratio", "SE")
    return(out)
}



ci.mean.paired = function(alpha,x,y) {
    r = x/y
    r=r[!is.na(r)]
    
    mr=mean(r)

    s=sd(r)/sqrt(length(r)) # standard error of mean
    z=qt(1-(alpha/2),length(r)-1)
    
    out=t(c(mr,mr-(z*s),mr+z*s))
    colnames(out)=c('fold_change','lower_ci','upper_ci')
    
    return(out)
}



ci.median.bs <- function(alpha, y1, y2) {
    # from 10.3102/1076998620934125
    # Computes confidence interval for a ratio of population medians of ratio-scale
    # measurements in a design with a 2-level between-subjects factor.
    # Arguments:
    #   alpha: alpha level for 1-alpha confidence
    #   y1:    n1 x 1 vector of scores for group 1
    #   y2:    n2 x 1 vector of scores for group 2  
    # Returns:
    #   confidence interval
    y1=y1[!is.na(y1)]
    y2=y2[!is.na(y2)]
    z <- qnorm(1 - alpha/2)
    n1 <- length(y1)
    y1 <- sort(y1)
    n2 <- length(y2)
    y2 <- sort(y2)
    med1 <- median(y1)
    med2 <- median(y2)
    o1 <- round(n1/2 - sqrt(n1))
    if (o1 < 1) {o1 = 1}
    o2 <- n1 - o1 + 1
    l1 <- log(y1[o1])
    u1 <- log(y1[o2])
    p <- pbinom(o1 - 1, size = n1, prob = .5)
    z0 <- qnorm(1 - p)
    se1 <- (u1 - l1)/(2*z0)
    o1 <- round(n2/2 - sqrt(n2))
    if (o1 < 1) {o1 = 1}
    o2 <- n2 - o1 + 1
    l2 <- log(y2[o1])
    u2 <- log(y2[o2])
    p <- pbinom(o1 - 1, size = n2, prob = .5)
    z0 <- qnorm(1 - p)
    se2 <- (u2 - l2)/(2*z0)
    se <- sqrt(se1^2 + se2^2)
    logratio <- log(med1/med2)
    ll <- exp(logratio - z*se)
    ul <- exp(logratio + z*se)
    out <- t(c(exp(logratio), ll, ul))
    colnames(out) <- c("fold_change", "LL", "UL")
    return(out)
}