R/get_read_entropy.R
4d80667c
 #' @importFrom Biostrings width
308ad71d
 # nocov start
1e9b7abf
 get_read_entropy <- function(bam_path, sample = fs::path_file(bam_path)) {
     # helper functions ----
     count_crossings <- function(numbers) {
         crossings <- sum(((numbers[-1] - 127.5) * (numbers[-length(numbers)] - 127.5)) < 0)
         return(crossings)
     }
 
     parse_chunk <- function(reads, sample) {
         reads <- reads[!is.null(reads$seq)]
         tibble::tibble(
             sample = factor(sample),
             read_name = reads$qname,
             chr = factor(reads$rname),
             pos = reads$pos,
             width = Biostrings::width(reads$seq),
19dc3f0c
             center = as.integer(.data$pos + .data$width / 2),
7f4623a1
             cg_count = purrr::map_int(as.character(reads$seq), count_cg_cpp),
1e9b7abf
             crossings = purrr::map_int(reads$tag$ML, count_crossings),
4d80667c
             entropy = .data$crossings / .data$cg_count
1e9b7abf
         )
     }
 
     # function body ----
     fname <- fs::path_file(bam_path)
         cli::cli_progress_bar(
             glue::glue("Parsing file: {fname}"),
             total = get_bam_total_reads(bam_path),
             format_done = paste0(
                 "{.alert-success Data parsed: ", fname, " {.timestamp {cli::pb_elapsed}}}"),
             format_failed = paste0(
                 "{.alert-danger Data parsing failed: ", fname, " {.timestamp {cli::pb_elapsed}}}"),
             clear = FALSE
         )
 
     bam_file <- Rsamtools::BamFile(bam_path, yieldSize = 15000)
 
     i <- 1
     df_list <- list()
     open(bam_file)
     while (Rsamtools::isIncomplete(bam_file)) {
ee042bae
         reads <- read_bam(bam_file)[[1]]
1e9b7abf
         df_list[[i]] <- parse_chunk(reads, sample)
         i <- i + 1
57f4eaf8
         cli::cli_progress_update(length(reads[[1]]))
1e9b7abf
     }
     close(bam_file)
 
     bind_rows(df_list)
 }
308ad71d
 # nocov end