Bioconductor Code: NanoMethViz

R/cluster_reads.R

34abd9e3	#' Cluster reads based on methylation #' #' @param x a ModBamResult object. #' @param chr the chromosome name where to find the region. #' @param start the start position of the region. #' @param end the end position of the region. #' @param min_pts the minimum number of points needed to form a cluster (default = 10). #' #' @return A tibble with information about each read's cluster assignment and read statistics. #' #' @import tidyr #' @import dplyr #' @import dbscan #' @importFrom tibble rownames_to_column
41085636	cluster_reads <- function(x, chr, start, end, min_pts = 5) {
22cea658	assertthat::assert_that( is(x, "ModBamResult"), assertthat::is.string(chr) \|\| (is.factor(chr) && assertthat::is.scalar(chr)), assertthat::is.number(start) && assertthat::is.number(end), assertthat::is.number(min_pts) && min_pts >= 1 )
34abd9e3	# query data
4dccdae1	methy_data <- query_methy(x, chr, start, end) if (nrow(methy_data) == 0) { stop(glue::glue("no reads containing methylation data found in specified region")) } methy_data <- methy_data %>%
31ba37bb	dplyr::filter(.data$pos >= start & .data$pos < end)
078dc579
34abd9e3	read_stats <- get_read_stats(methy_data)
078dc579
34abd9e3	# identify the read names whose span is at least 90% the length of maximum span # filter methylation data for only those reads that meet the above condition of span
078dc579	max_span <- max(read_stats$span) keep_reads <- read_stats$read_name[read_stats$span > 0.9 * max_span] methy_data <- methy_data %>%
80dfa634	dplyr::filter(.data$read_name %in% keep_reads)
078dc579
34abd9e3	# convert methylation data into a matrix with one row for each read name
078dc579	mod_mat <- methy_data %>%
80dfa634	dplyr::select("read_name", "pos", "mod_prob") %>% dplyr::arrange(.data$pos) %>% tidyr::pivot_wider(names_from = "pos", values_from = "mod_prob") %>%
078dc579	df_to_matrix()
4dccdae1	# pre-check before filtering
e05822ed	if (nrow(mod_mat) < min_pts) { stop(glue::glue("fewer reads available ({nrow(mod_mat)} reads) than minimum cluster size 'min_pts' ({min_pts})"))
4dccdae1	}
2261a3b7	# remove positions with high missingness (>60%) then reads with high missingness (>30%)
078dc579	mod_mat_filled <- mod_mat[order(rownames(mod_mat)), ] col_missingness <- mat_col_map(mod_mat_filled, missingness)
34abd9e3	mod_mat_filled <- mod_mat_filled[, col_missingness < 0.6]
078dc579	row_missingness <- mat_row_map(mod_mat_filled, missingness)
34abd9e3	mod_mat_filled <- mod_mat_filled[row_missingness < 0.3, ] # fill in missing values with mean methylation probability across that read
19dc3f0c	for (i in seq_len(nrow(mod_mat_filled))) {
078dc579	mod_mat_filled[i, is.na(mod_mat_filled[i, ])] <- mean(mod_mat_filled[i, ], na.rm = TRUE) }
4dccdae1	# post-check before filtering if (nrow(mod_mat_filled) < min_pts) { stop(glue::glue("fewer reads available ({nrow(mod_mat_filled)} reads) than minimum cluster size 'min_pts' ({min_pts})")) }
078dc579
34abd9e3	# cluster reads using HDBSCAN algorithm with specified minimum number of points
e54891dd	dbsc <- dbscan::hdbscan(mod_mat_filled, minPts = min_pts)
6a4b7748	clust_df <- data.frame(read_name = rownames(mod_mat_filled), cluster_id = dbsc$cluster)
078dc579
34abd9e3	# merge and process results of cluster analysis and read statistics clust_df %>%
c5486d82	dplyr::inner_join(read_stats, by = "read_name") %>%
80dfa634	dplyr::arrange(.data$cluster_id) %>%
e54891dd	dplyr::mutate(
80dfa634	cluster_id = as.factor(.data$cluster_id), start = as.integer(.data$start), end = as.integer(.data$end), span = as.integer(.data$span)
e54891dd	)
078dc579	}
34abd9e3	# summarize read statistics (start, end, strand) based on same read name get_read_stats <- function(methy_data) { methy_data %>%
80dfa634	group_by(.data$read_name) %>%
34abd9e3	summarise(
80dfa634	start = min(.data$pos), end = max(.data$pos), mean = mean(.data$mod_prob, na.rm = TRUE), span = .data$end - .data$start, strand = unique(.data$strand)
34abd9e3	) %>%
80dfa634	arrange(.data$strand)
34abd9e3	}