R/cpgDensity.R
be7d4113
 #'Provides Coverage by the CpG density. CpG Density is defined as the number
 #'of CpGs observed in certain base pair long region.
05333069
 #'@param bs bsseq object
ba93b446
 #'@param organism scientific name of the organism of interest,
1e49b4be
 #'e.g. Mmusculus or Hsapiens
662e4406
 #'@param windowLength Length of the window to calculate the density
6a4a0d3d
 #'@param small Indicator for a small dataset, cpg density is calculated more
 #'memory efficiently for large dataset but for small dataset a different quicker
 #'method is used
be7d4113
 #'Default value for window length is 1000 basepairs.
05333069
 #'@return Data frame with sample name and coverage in repeat masker regions
 #'@examples
8061babb
 #'library(BSgenome.Hsapiens.NCBI.GRCh38)
be7d4113
 #'directory <- system.file("extdata/bismark_data",package='scmeth')
 #'bs <- HDF5Array::loadHDF5SummarizedExperiment(directory)
6a4a0d3d
 #'cpgDensity(bs,Hsapiens,1000,small=TRUE)
cb356578
 #'@import BSgenome
efa5aeca
 #'@importFrom bsseq getCoverage
12ffa4ac
 #'@importFrom Biostrings DNAString
 #'@importFrom Biostrings vmatchPattern
05333069
 #'@export
f260e769
 
6a4a0d3d
 cpgDensity <- function(bs,organism,windowLength=1000,small=FALSE){
05333069
 
be7d4113
     #GenomeInfoDb::seqlevelsStyle(gr) <- GenomeInfoDb::seqlevelsStyle(organism)[1]
12ffa4ac
     cov <- bsseq::getCoverage(bs)
     gr <- GenomicRanges::granges(bs)
     cpg <- Biostrings::DNAString("CG")
 
6a4a0d3d
     if (!small){
       cpg_gr <- Biostrings::vmatchPattern(cpg, organism)
       cpg_gr <- GenomeInfoDb::keepStandardChromosomes(cpg_gr, pruning.mode= "coarse")
 
       r_cpg_gr <- GenomicRanges::resize(cpg_gr,width=(windowLength/2),fix='center')
       cpgd <- GenomicRanges::countOverlaps(gr,r_cpg_gr)
 
     }else{
       gr_resized <- GenomicRanges::resize(gr, width=(500/2),fix='center')
       v <- Biostrings::Views(organism, gr_resized)
       dnFrequency <- Biostrings::dinucleotideFrequency(v)
       cpgd <- dnFrequency[,'CG']
     }
 
12ffa4ac
 
     #cpgd <- Repitools::cpgDensityCalc(gr, organism, window = windowLength)
2351755b
 
be7d4113
     maxcpgd <- max(cpgd)
     cpgdCov <- sapply(seq_len(ncol(cov)), function(i) {
e8f2b9b4
         cv = as.vector(cov[,i])
be7d4113
         cpgdCell <- cpgd[cv>0 ]
e8f2b9b4
         tab <- table(cpgdCell)
         x <- rep(0, maxcpgd)
         x[as.numeric(names(tab))] <- tab
         x
2351755b
     })
 
be7d4113
     rownames(cpgdCov) <- seq_len(maxcpgd)
7a46bec0
     return(cpgdCov)
05333069
 }