# Bioconductor Code: NanoMethViz

# (code viewer navigation: Raw / Blame / Patch / Log / History)
#' Cluster reads based on methylation
#'
#' Queries the methylation calls of a region, keeps the reads whose span is
#' close to the longest span observed, arranges the modification probabilities
#' into a read-by-position matrix, drops sparsely covered positions and reads,
#' imputes the remaining gaps with each read's mean probability and clusters
#' the reads with HDBSCAN.
#'
#' @param x a ModBamResult object.
#' @param chr the chromosome name where to find the region.
#' @param start the start position of the region.
#' @param end the end position of the region. Only calls with
#'   `start <= pos < end` are used.
#' @param min_pts the minimum number of points needed to form a cluster (default = 5).
#'
#' @return A data frame with one row per clustered read containing the read
#'   name, its cluster assignment (`cluster_id`, a factor; HDBSCAN reports
#'   noise points as cluster 0) and the read statistics `start`, `end`,
#'   `mean`, `span` and `strand`, ordered by cluster.
#'
#' @import tidyr
#' @import dplyr
#' @import dbscan
#' @importFrom tibble rownames_to_column
cluster_reads <- function(x, chr, start, end, min_pts = 5) {
    assertthat::assert_that(
        is(x, "ModBamResult"),
        assertthat::is.string(chr) || (is.factor(chr) && assertthat::is.scalar(chr)),
        assertthat::is.number(start) && assertthat::is.number(end),
        assertthat::is.number(min_pts) && min_pts >= 1
    )

    # query data and restrict it to calls inside the half-open region [start, end)
    methy_data <- query_methy(x, chr, start, end)
    if (nrow(methy_data) > 0) {
        methy_data <- methy_data %>%
            dplyr::filter(.data$pos >= start & .data$pos < end)
    }

    # checked after the position filter as well: a query that only returns
    # calls outside [start, end) would otherwise reach max() on an empty vector
    if (nrow(methy_data) == 0) {
        stop(glue::glue("no reads containing methylation data found in specified region"))
    }

    read_stats <- get_read_stats(methy_data)

    # identify the read names whose span is more than 90% of the maximum span
    # and keep the methylation data of only those reads
    max_span <- max(read_stats$span)
    keep_reads <- read_stats$read_name[read_stats$span > 0.9 * max_span]
    methy_data <- methy_data %>%
        dplyr::filter(.data$read_name %in% keep_reads)

    # convert methylation data into a matrix with one row for each read name
    # and one column per position (columns follow ascending position)
    mod_mat <- methy_data %>%
        dplyr::select("read_name", "pos", "mod_prob") %>%
        dplyr::arrange(.data$pos) %>%
        tidyr::pivot_wider(names_from = "pos", values_from = "mod_prob") %>%
        df_to_matrix()

    # pre-check before filtering
    if (nrow(mod_mat) < min_pts) {
        stop(glue::glue("fewer reads available ({nrow(mod_mat)} reads) than minimum cluster size 'min_pts' ({min_pts})"))
    }

    # remove positions with high missingness (>=60%) then reads with high
    # missingness (>=30%); drop = FALSE keeps the matrix shape even when a
    # single row or column survives
    mod_mat_filled <- mod_mat[order(rownames(mod_mat)), , drop = FALSE]
    col_missingness <- mat_col_map(mod_mat_filled, missingness)
    mod_mat_filled <- mod_mat_filled[, col_missingness < 0.6, drop = FALSE]
    row_missingness <- mat_row_map(mod_mat_filled, missingness)
    # which() discards NA comparisons instead of inserting all-NA rows
    # (missingness may be undefined if no position survived the column filter)
    mod_mat_filled <- mod_mat_filled[which(row_missingness < 0.3), , drop = FALSE]

    # post-check after filtering
    if (nrow(mod_mat_filled) < min_pts) {
        stop(glue::glue("fewer reads available ({nrow(mod_mat_filled)} reads) than minimum cluster size 'min_pts' ({min_pts})"))
    }

    # fill in missing values with mean methylation probability across that read
    for (i in seq_len(nrow(mod_mat_filled))) {
        mod_mat_filled[i, is.na(mod_mat_filled[i, ])] <- mean(mod_mat_filled[i, ], na.rm = TRUE)
    }

    # cluster reads using HDBSCAN algorithm with specified minimum number of points
    dbsc <- dbscan::hdbscan(mod_mat_filled, minPts = min_pts)
    clust_df <- data.frame(read_name = rownames(mod_mat_filled), cluster_id = dbsc$cluster)

    # merge and process results of cluster analysis and read statistics
    clust_df %>%
        dplyr::inner_join(read_stats, by = "read_name") %>%
        dplyr::arrange(.data$cluster_id) %>%
        dplyr::mutate(
            cluster_id = as.factor(.data$cluster_id),
            start = as.integer(.data$start),
            end = as.integer(.data$end),
            span = as.integer(.data$span)
        )
}

# Collapse per-position methylation calls into per-read statistics.
#
# For every read name the result holds the smallest and largest position
# (start, end), the mean modification probability ignoring NAs (mean), the
# distance between end and start (span) and the strand. Rows are ordered by
# strand.
# NOTE(review): strand is taken with unique(), so a read name is expected to
# carry a single strand value - confirm against the data source.
get_read_stats <- function(methy_data) {
    calls_by_read <- dplyr::group_by(methy_data, .data$read_name)

    per_read <- dplyr::summarise(
        calls_by_read,
        start = min(.data$pos),
        end = max(.data$pos),
        mean = mean(.data$mod_prob, na.rm = TRUE),
        # start/end here are the summaries computed just above
        span = .data$end - .data$start,
        strand = unique(.data$strand)
    )

    dplyr::arrange(per_read, .data$strand)
}