#' Discretize the CpG methylation values
#' to align with single cell analysis
#'
#'In single cell analysis overwhelmingly large number of CpGs have binary
#'methylation
#'Due to errors in sequencing and amplification many CpGs tend to have
#'non-binary methylation. Hence
#'this function catergorizes the non-binary CpGs as methylated if the
#'methyation is above 0.8 and
#'unmethylated if the methylation is below 0.2
#'@param bs bsseq object
#'@param subSample number of CpGs to subsample.
#'Default value is 1000000.
#'@param offset how many CpGs to offset when subsampling
#'Default value is set to be 50000, i.e. first 50000 CpGs will
#'be ignored in subsampling.
#'@param coverageVec If coverage vector is already calculated provide it to
#'speed up the process
#'@return meth discretized methylation matrix
#'@return discard total number of removed CpGs from each sample
#'@return Percentage of CpGs discarded compared to the total number of CpGs
#'@examples
#'directory <- system.file("extdata/bismark_data",package='scmeth')
#'bs <- HDF5Array::loadHDF5SummarizedExperiment(directory)
#'cpgDiscretization(bs)
#'@importFrom DelayedArray colSums
#'@importFrom bsseq getCoverage
#'@export


cpgDiscretization <- function(bs,subSample=1e6,offset=50000,coverageVec=NULL){
    # subsampling
    nCpGs <- nrow(bs)

    if (nCpGs<(subSample+offset)){
        bs <- bs
        subSample <- nCpGs
    }else{
        bs <- bs[offset:(subSample+offset)]
    }

    covMatrix <- bsseq::getCoverage(bs)
    methMatrix <- bsseq::getCoverage(bs,type='M')
    nSamples <- ncol(methMatrix)
    methMatrix <- methMatrix/covMatrix
    if (is.null(coverageVec)){
        covVec <- DelayedArray::colSums(covMatrix>0,na.rm=TRUE)
        covVec <- covVec*(nCpGs/subSample)
    }else{
        covVec <- coverageVec
    }

    # Only consider methylation between 0.2 and 0.8
    methCutOff <- c(0.01,0.19,0.79,1.0)

    methylationDistMatrix <- sapply(seq_len(nSamples), function(i) {
        mv = as.vector(methMatrix[,i])
        mv <- mv[!is.na(mv)]
        mvBin <- cut(mv,methCutOff)
        tab <- table(mvBin)
        x <- tab
        x
    })

    removedCpGs <- methylationDistMatrix[2,]*(nCpGs/subSample)
    removedCpGFrac <- (removedCpGs/(covVec))*100
    returnList <- list('discard' = removedCpGs,
                        'discardPerc' = removedCpGFrac)
    return(returnList)
}