R/cluster_reads.R
34abd9e3
 #' Cluster reads based on methylation
 #'
 #' @param x a ModBamResult object.
 #' @param chr the chromosome name where to find the region.
 #' @param start the start position of the region.
 #' @param end the end position of the region.
 #' @param min_pts the minimum number of points needed to form a cluster (default = 10).
 #'
 #' @return A tibble with information about each read's cluster assignment and read statistics.
 #'
 #' @import tidyr
 #' @import dplyr
 #' @import dbscan
 #' @importFrom tibble rownames_to_column
41085636
 cluster_reads <- function(x, chr, start, end, min_pts = 5) {
22cea658
     assertthat::assert_that(
         is(x, "ModBamResult"),
         assertthat::is.string(chr) || (is.factor(chr) && assertthat::is.scalar(chr)),
         assertthat::is.number(start) && assertthat::is.number(end),
         assertthat::is.number(min_pts) && min_pts >= 1
     )
 
34abd9e3
     # query data
4dccdae1
     methy_data <- query_methy(x, chr, start, end)
 
     if (nrow(methy_data) == 0) {
         stop(glue::glue("no reads containing methylation data found in specified region"))
     }
 
     methy_data <- methy_data %>%
31ba37bb
         dplyr::filter(.data$pos >= start & .data$pos < end)
078dc579
 
34abd9e3
     read_stats <- get_read_stats(methy_data)
078dc579
 
34abd9e3
     # identify the read names whose span is at least 90% the length of maximum span
     # filter methylation data for only those reads that meet the above condition of span
078dc579
     max_span <- max(read_stats$span)
     keep_reads <- read_stats$read_name[read_stats$span > 0.9 * max_span]
     methy_data <- methy_data %>%
80dfa634
         dplyr::filter(.data$read_name %in% keep_reads)
078dc579
 
34abd9e3
     # convert methylation data into a matrix with one row for each read name
078dc579
     mod_mat <- methy_data %>%
80dfa634
         dplyr::select("read_name", "pos", "mod_prob") %>%
         dplyr::arrange(.data$pos) %>%
         tidyr::pivot_wider(names_from = "pos", values_from = "mod_prob") %>%
078dc579
         df_to_matrix()
 
4dccdae1
     # pre-check before filtering
e05822ed
     if (nrow(mod_mat) < min_pts) {
         stop(glue::glue("fewer reads available ({nrow(mod_mat)} reads) than minimum cluster size 'min_pts' ({min_pts})"))
4dccdae1
     }
 
2261a3b7
     # remove positions with high missingness (>60%) then reads with high missingness (>30%)
078dc579
     mod_mat_filled <- mod_mat[order(rownames(mod_mat)), ]
     col_missingness <- mat_col_map(mod_mat_filled, missingness)
34abd9e3
     mod_mat_filled <- mod_mat_filled[, col_missingness < 0.6]
078dc579
     row_missingness <- mat_row_map(mod_mat_filled, missingness)
34abd9e3
     mod_mat_filled <- mod_mat_filled[row_missingness < 0.3, ]
 
     # fill in missing values with mean methylation probability across that read
19dc3f0c
     for (i in seq_len(nrow(mod_mat_filled))) {
078dc579
         mod_mat_filled[i, is.na(mod_mat_filled[i, ])] <- mean(mod_mat_filled[i, ], na.rm = TRUE)
     }
4dccdae1
 
     # post-check before filtering
     if (nrow(mod_mat_filled) < min_pts) {
         stop(glue::glue("fewer reads available ({nrow(mod_mat_filled)} reads) than minimum cluster size 'min_pts' ({min_pts})"))
     }
078dc579
 
34abd9e3
     # cluster reads using HDBSCAN algorithm with specified minimum number of points
e54891dd
     dbsc <- dbscan::hdbscan(mod_mat_filled, minPts = min_pts)
6a4b7748
     clust_df <- data.frame(read_name = rownames(mod_mat_filled), cluster_id = dbsc$cluster)
078dc579
 
34abd9e3
     # merge and process results of cluster analysis and read statistics
     clust_df %>%
c5486d82
         dplyr::inner_join(read_stats, by = "read_name") %>%
80dfa634
         dplyr::arrange(.data$cluster_id) %>%
e54891dd
         dplyr::mutate(
80dfa634
             cluster_id = as.factor(.data$cluster_id),
             start = as.integer(.data$start),
             end = as.integer(.data$end),
             span = as.integer(.data$span)
e54891dd
         )
078dc579
 }
34abd9e3
 
 # summarize read statistics (start, end, strand) based on same read name
 get_read_stats <- function(methy_data) {
     methy_data %>%
80dfa634
         group_by(.data$read_name) %>%
34abd9e3
         summarise(
80dfa634
             start = min(.data$pos),
             end = max(.data$pos),
             mean = mean(.data$mod_prob, na.rm = TRUE),
             span = .data$end - .data$start,
             strand = unique(.data$strand)
34abd9e3
         ) %>%
80dfa634
         arrange(.data$strand)
34abd9e3
 }