#' Cluster reads based on methylation #' #' @param x a ModBamResult object. #' @param chr the chromosome name where to find the region. #' @param start the start position of the region. #' @param end the end position of the region. #' @param min_pts the minimum number of points needed to form a cluster (default = 10). #' #' @return A tibble with information about each read's cluster assignment and read statistics. #' #' @import tidyr #' @import dplyr #' @import dbscan #' @importFrom tibble rownames_to_column cluster_reads <- function(x, chr, start, end, min_pts = 5) { assertthat::assert_that( is(x, "ModBamResult"), assertthat::is.string(chr) || (is.factor(chr) && assertthat::is.scalar(chr)), assertthat::is.number(start) && assertthat::is.number(end), assertthat::is.number(min_pts) && min_pts >= 1 ) # query data methy_data <- query_methy(x, chr, start, end) if (nrow(methy_data) == 0) { stop(glue::glue("no reads containing methylation data found in specified region")) } methy_data <- methy_data %>% dplyr::filter(.data$pos >= start & .data$pos < end) read_stats <- get_read_stats(methy_data) # identify the read names whose span is at least 90% the length of maximum span # filter methylation data for only those reads that meet the above condition of span max_span <- max(read_stats$span) keep_reads <- read_stats$read_name[read_stats$span > 0.9 * max_span] methy_data <- methy_data %>% dplyr::filter(.data$read_name %in% keep_reads) # convert methylation data into a matrix with one row for each read name mod_mat <- methy_data %>% dplyr::select("read_name", "pos", "mod_prob") %>% dplyr::arrange(.data$pos) %>% tidyr::pivot_wider(names_from = "pos", values_from = "mod_prob") %>% df_to_matrix() # pre-check before filtering if (nrow(mod_mat) < min_pts) { stop(glue::glue("fewer reads available ({nrow(mod_mat)} reads) than minimum cluster size 'min_pts' ({min_pts})")) } # remove positions with high missingness (>60%) then reads with high missingness (>30%) mod_mat_filled <- mod_mat[order(rownames(mod_mat)), ] col_missingness <- mat_col_map(mod_mat_filled, missingness) mod_mat_filled <- mod_mat_filled[, col_missingness < 0.6] row_missingness <- mat_row_map(mod_mat_filled, missingness) mod_mat_filled <- mod_mat_filled[row_missingness < 0.3, ] # fill in missing values with mean methylation probability across that read for (i in seq_len(nrow(mod_mat_filled))) { mod_mat_filled[i, is.na(mod_mat_filled[i, ])] <- mean(mod_mat_filled[i, ], na.rm = TRUE) } # post-check before filtering if (nrow(mod_mat_filled) < min_pts) { stop(glue::glue("fewer reads available ({nrow(mod_mat_filled)} reads) than minimum cluster size 'min_pts' ({min_pts})")) } # cluster reads using HDBSCAN algorithm with specified minimum number of points dbsc <- dbscan::hdbscan(mod_mat_filled, minPts = min_pts) clust_df <- data.frame(read_name = rownames(mod_mat_filled), cluster_id = dbsc$cluster) # merge and process results of cluster analysis and read statistics clust_df %>% dplyr::inner_join(read_stats, by = "read_name") %>% dplyr::arrange(.data$cluster_id) %>% dplyr::mutate( cluster_id = as.factor(.data$cluster_id), start = as.integer(.data$start), end = as.integer(.data$end), span = as.integer(.data$span) ) } # summarize read statistics (start, end, strand) based on same read name get_read_stats <- function(methy_data) { methy_data %>% group_by(.data$read_name) %>% summarise( start = min(.data$pos), end = max(.data$pos), mean = mean(.data$mod_prob, na.rm = TRUE), span = .data$end - .data$start, strand = unique(.data$strand) ) %>% arrange(.data$strand) }