Bioconductor Code: scmeth

Raw Blame Patch Log History
#'Provides Coverage by the CpG density. CpG Density is defined as the number
#'of CpGs observed in certain base pair long region.
#'@param bs bsseq object
#'@param organism scientific name of the organism of interest,
#'e.g. Mmusculus or Hsapiens
#'@param windowLength Length of the window to calculate the density
#'@param small Indicator for a small dataset, cpg density is calculated more
#'memory efficiently for large dataset but for small dataset a different quicker
#'method is used
#'Default value for window length is 1000 basepairs.
#'@return Data frame with sample name and coverage in repeat masker regions
#'@examples
#'library(BSgenome.Hsapiens.NCBI.GRCh38)
#'directory <- system.file("extdata/bismark_data",package='scmeth')
#'bs <- HDF5Array::loadHDF5SummarizedExperiment(directory)
#'cpgDensity(bs,Hsapiens,1000,small=TRUE)
#'@import BSgenome
#'@importFrom bsseq getCoverage
#'@importFrom Biostrings DNAString
#'@importFrom Biostrings vmatchPattern
#'@export

cpgDensity <- function(bs,organism,windowLength=1000,small=FALSE){

    #GenomeInfoDb::seqlevelsStyle(gr) <- GenomeInfoDb::seqlevelsStyle(organism)[1]
    cov <- bsseq::getCoverage(bs)
    gr <- GenomicRanges::granges(bs)
    cpg <- Biostrings::DNAString("CG")

    if (!small){
      cpg_gr <- Biostrings::vmatchPattern(cpg, organism)
      cpg_gr <- GenomeInfoDb::keepStandardChromosomes(cpg_gr, pruning.mode= "coarse")

      r_cpg_gr <- GenomicRanges::resize(cpg_gr,width=(windowLength/2),fix='center')
      cpgd <- GenomicRanges::countOverlaps(gr,r_cpg_gr)

    }else{
      gr_resized <- GenomicRanges::resize(gr, width=(500/2),fix='center')
      v <- Biostrings::Views(organism, gr_resized)
      dnFrequency <- Biostrings::dinucleotideFrequency(v)
      cpgd <- dnFrequency[,'CG']
    }


    #cpgd <- Repitools::cpgDensityCalc(gr, organism, window = windowLength)

    maxcpgd <- max(cpgd)
    cpgdCov <- sapply(seq_len(ncol(cov)), function(i) {
        cv = as.vector(cov[,i])
        cpgdCell <- cpgd[cv>0 ]
        tab <- table(cpgdCell)
        x <- rep(0, maxcpgd)
        x[as.numeric(names(tab))] <- tab
        x
    })

    rownames(cpgdCov) <- seq_len(maxcpgd)
    return(cpgdCov)
}