R/plot_agg_regions.R
38c06b3c
 #' Plot aggregate regions
 #'
78cc3174
 #' @param x the NanoMethResult object.
1fbaedd3
 #' @param regions a table of regions containing at least columns chr, strand,
62371c8b
 #'   start and end. Any additional columns can be used for grouping.
3e7ffe34
 #' @param binary_threshold the modification probability such that calls with
 #'   modification probability above the threshold are considered methylated, and
 #'   those with probability equal or below are considered unmethylated.
1fbaedd3
 #' @param group_col the column to group aggregated trends by. This column can
 #'   be in from the regions table or samples(x).
bba03c09
 #' @param flank the number of flanking bases to add to each side of each region.
f8f55443
 #' @param stranded TRUE if negative strand features should have coordinates
 #'   flipped to reflect features like transcription start sites.
78cc3174
 #' @param span the span for loess smoothing.
4029077d
 #' @param palette the ggplot colour palette used for groups.
38c06b3c
 #'
a8c880e0
 #' @return a ggplot object containing the aggregate methylation trend.
1f94de93
 #'
302f0646
 #' @examples
 #' nmr <- load_example_nanomethresult()
f9958e01
 #' gene_anno <- exons_to_genes(NanoMethViz::exons(nmr))
 #' plot_agg_regions(nmr, gene_anno)
1fbaedd3
 #' plot_agg_regions(nmr, gene_anno, group_col = "sample")
 #' plot_agg_regions(nmr, gene_anno, group_col = "group")
302f0646
 #'
38c06b3c
 #' @export
78cc3174
 plot_agg_regions <- function(
     x,
     regions,
3e7ffe34
     binary_threshold = 0.5,
1fbaedd3
     group_col = NULL,
78cc3174
     flank = 2000,
fddf8b3c
     stranded = TRUE,
f9958e01
     span = 0.05,
     palette = ggplot2::scale_colour_brewer(palette = "Set1")
78cc3174
 ) {
1fbaedd3
     if (!is.null(group_col)) {
         avail_columns <- c(colnames(samples(x)), colnames(regions))
         assertthat::assert_that(
             group_col %in% avail_columns,
             msg = glue::glue("'{group_col}' could not be found in columns of 'regions' or samples(x)")
         )
     }
38c06b3c
 
35cc9b98
     # grouped regions crashes downstream operations
     regions <- ungroup(regions)
 
5657fa9e
     # query methylation data
b93920f3
     methy_data <- plot_agg_regions.get_methy_data(x, regions, flank)
 
     # process each methy_data list element to obtain methylation proportions at relative positions
     methy_data <- plot_agg_regions.process_methy_data(methy_data, stranded, flank, binary_threshold)
 
     # unnest methy_data and add sample annotation
     methy_data <- plot_agg_regions.unnest_with_anno(methy_data, NanoMethViz::samples(x))
 
     # take the average methylation proportion over equal sized positional bins
     methy_data <- plot_agg_regions.avg_over_bins(methy_data, group_col = group_col)
302f0646
 
21d50658
     # change aes spec depending on if group_col is available
     # hacky fix to the deprecation of aes_string which handled a NULL group_col
     # value
     if (!is.null(group_col)) {
         aes_spec <- ggplot2::aes(
                 x = .data$binned_pos,
                 y = .data$methy_prop,
                 group = .data[[group_col]],
                 col = .data[[group_col]])
     } else {
         aes_spec <- ggplot2::aes(
                 x = .data$binned_pos,
                 y = .data$methy_prop)
     }
 
5657fa9e
     # set up plot
1863d990
     p <- ggplot2::ggplot() +
38c06b3c
         ggplot2::ylim(c(0, 1)) +
4c61ee9d
         ggplot2::theme_bw() +
1fbaedd3
         ggplot2::stat_smooth(
21d50658
             aes_spec,
1fbaedd3
             method = "loess",
             formula = "y ~ x",
             span = span,
             na.rm = TRUE,
             se = FALSE,
             data = methy_data) +
f9958e01
         palette
38c06b3c
 
b93920f3
     # if flank is 0, then then only annotated start and end, else annotate flanks as well
f9958e01
     if (flank == 0) {
         labels <- c("start", "end")
4029077d
         breaks <- c(0, 1)
         limits <- c(0, 1)
f9958e01
     } else {
4029077d
         breaks <- c(-.33, 0, 1, 1.33)
         limits <- c(-0.33, 1.33)
f9958e01
         kb_marker <- round(flank / 1000, 1)
         labels <- c(glue::glue("-{kb_marker}kb"), "start", "end", glue::glue("+{kb_marker}kb"))
         p <- p +
             ggplot2::geom_vline(xintercept = 0, linetype = "dashed", color = "grey80") +
             ggplot2::geom_vline(xintercept = 1, linetype = "dashed", color = "grey80")
38c06b3c
     }
 
35cc9b98
     region_widths <- regions$end - regions$start
     width_avg <- round(mean(region_widths))
4e46ddd9
     width_std_dev <- round(sd(region_widths))
35cc9b98
 
302f0646
     p + ggplot2::coord_cartesian(clip = "off") +
bba03c09
         ggplot2::scale_x_continuous(
35cc9b98
             name = glue::glue("Relative Position (Avg.Width = {width_avg}, Std.Dev = {width_std_dev})"),
f9958e01
             breaks = breaks,
             limits = limits,
a6df6e5a
             labels = labels) +
f9958e01
         ggplot2::ylab("Average Methylation Proportion")
38c06b3c
 }
 
b93920f3
 plot_agg_regions.get_methy_data <- function(x, regions, flank) {
35cc9b98
     query_row_methy <- function(i, methy, regions, flank) {
         # query a larger region such that smoothing doesn't
         # misbehave around ends
         flank <- flank * 1.1
         query_methy(
             methy,
             regions$chr[i],
             regions$start[i] - flank,
             regions$end[i] + flank,
             force = TRUE)
     }
 
     methy_data <- purrr::map(
         seq_len(nrow(regions)),
         query_row_methy,
4000496d
         methy = x,
35cc9b98
         regions = regions,
         flank = flank
     )
 
19dc3f0c
     regions %>%
35cc9b98
         mutate(methy_data = methy_data)
 }
 
b93920f3
 plot_agg_regions.process_methy_data <- function(methy_data, stranded, flank, binary_threshold) {
38c06b3c
 
b93920f3
     remove_empty_methy_data <- function(x) {
         x %>%
             dplyr::filter(purrr::map_lgl(.data$methy_data, ~nrow(.x) > 0))
     }
 
     rescale_positions <- function(x, stranded, flank) {
         # for each region, scale the position values to be between 0 and 1
         for (i in seq_len(nrow(x))) {
             # determine if the positions fall inside the region
             m_data <- x$methy_data[[i]]$pos
             within <- m_data >= x$start[i] & m_data <= x$end[i]
             upstream <- m_data < x$start[i]
             downstream <- m_data > x$end[i]
 
             # rescale the positions depending on the region
             rel_pos <- numeric(length(m_data))
             rel_pos[within] <- (m_data[within] - x$start[i]) / (x$end[i] - x$start[i])
 
             # flanks each take up 1/3 of the plot space
             rel_pos[upstream] <- (m_data[upstream] - x$start[i]) / flank / 3
             rel_pos[downstream] <- 1 + ((m_data[downstream] - x$end[i]) / flank / 3)
 
             # flip positions for negative strand if stranded is TRUE
             if (stranded && length(x$strand[i]) > 0 && x$strand[i] == "-") {
                 rel_pos <- 1 - rel_pos
             }
 
             # store rescaled positions
             x$methy_data[[i]]$rel_pos <- rel_pos
bba03c09
         }
302f0646
 
b93920f3
         x
d4c97256
     }
 
b93920f3
     binarise_statistics <- function(x, binary_threshold) {
         binarise_statistc_in_df <- function(x, binary_threshold) {
             x %>%
                 dplyr::mutate(is_methylated = e1071::sigmoid(.data$statistic) > binary_threshold)
         }
 
         x %>%
             dplyr::mutate(
                 methy_data = purrr::map(
                     .data$methy_data,
                     binarise_statistc_in_df, binary_threshold = binary_threshold
                 )
             )
     }
 
     average_binarised_methy_per_pos <- function(x) {
         avg_methy_by_rel_pos <- function(x) {
             x %>%
                 dplyr::group_by(.data$sample, .data$chr, .data$strand, .data$rel_pos) %>%
                 dplyr::summarise(
                     methy_prop = mean(.data$is_methylated),
                     .groups = "drop")
         }
5cf99170
 
35cc9b98
         x %>%
b93920f3
             dplyr::mutate(
                 methy_data = purrr::map(.data$methy_data, avg_methy_by_rel_pos)
             )
35cc9b98
     }
 
b93920f3
     methy_data %>%
         remove_empty_methy_data() %>%
         rescale_positions(stranded, flank) %>%
         binarise_statistics(binary_threshold) %>%
         average_binarised_methy_per_pos()
5cf99170
 }
f9958e01
 
b93920f3
 
 plot_agg_regions.unnest_with_anno <- function(x, samples_anno) {
1fbaedd3
     x %>%
         dplyr::select(!dplyr::any_of(c("chr", "strand", "start", "end"))) %>%
         tidyr::unnest("methy_data") %>%
a4ffe4c6
         dplyr::inner_join(samples_anno, by = "sample", multiple = "all")
f9958e01
 }
 
b93920f3
 plot_agg_regions.avg_over_bins <- function(x, group_col, grid_size = 2^10) {
19dc3f0c
     min <- -1.1 / 3
     max <- 1 + 1.1 / 3
1fbaedd3
     binned_pos <- seq(min, max, length.out = grid_size + 1)
     binned_pos_df <- data.frame(
         binned_pos = binned_pos[-1],
         interval = cut(binned_pos, breaks = binned_pos)[-1]
f9958e01
     )
 
1fbaedd3
     x %>%
         dplyr::mutate(interval = cut(.data$rel_pos, breaks = binned_pos)) %>%
a4ffe4c6
         dplyr::inner_join(binned_pos_df, by = "interval", multiple = "all") %>%
1fbaedd3
         dplyr::group_by(dplyr::across(dplyr::all_of(group_col)), binned_pos) %>%
         dplyr::summarise(methy_prop = mean(.data$methy_prop), .groups = "drop")
f9958e01
 }