Bioconductor Code: NanoMethViz

Raw Blame Patch Log History
# plot methylation data queried using NanoMethViz
plot_methylation_data <- function(
    methy_data,
    sample_anno,
    chr,
    start,
    end,
    title,
    read_anno = NULL,
    group_col = "group",
    palette_col = ggplot2::scale_colour_brewer(palette = "Set1"),
    anno_regions = NULL,
    binary_threshold = NULL,
    avg_method = c("mean", "median"),
    spaghetti = FALSE,
    points = FALSE,
    smoothing_window = 2000,
    highlight_col = getOption("NanoMethViz.highlight_col", "grey50"),
    line_size = 1,
    mod_scale = c(0, 1),
    span = NULL
) {
    if (!missing("span")) {
        warning("the 'span' argument has been deprecated, please use 'smoothing_window' instead")
    }
    # assign averaging method
    avg_method <- match.arg(avg_method)
    avg_func <- switch(
        avg_method,
        mean = confidence_weighted_mean,
        median = median
    )

    # if there are regions to annotate
    if (!is.null(anno_regions)) {
        # filter annotation regions to be within plotting region
        anno_regions <- anno_regions %>%
            dplyr::filter(
                .data$chr == unique(methy_data$chr),
                .data$end >= min(methy_data$pos),
                .data$start <= max(methy_data$pos)
            )
    }

    plot_data <- methy_data %>%
        binarise_statistics(binary_threshold = binary_threshold) %>%
        join_samples_anno(sample_anno) %>%
        add_read_anno(read_anno)

    # set up plot
    # rug first so it appears on the bottom layer
    p <- ggplot(plot_data, aes(x = .data$pos, col = .data[[group_col]])) +
        ggplot2::geom_rug(aes(col = NULL), sides = "b")

    # add annotated regions
    if (!is.null(anno_regions)) {
        for (i in seq_len(nrow(anno_regions))) {
            region <- anno_regions[i, ]
            p <- p +
                ggplot2::annotate(
                    "rect",
                    xmin = region$start,
                    xmax = region$end,
                    ymin = -Inf,
                    ymax = Inf,
                    alpha = 0.2,
                    fill = highlight_col
                )
        }
    }

    # add points
    if (points) {
        p <- p +
            ggplot2::geom_point(
                aes(y = .data$mod_prob),
                alpha = 0.75
            )
    }

    # add spaghetti
    if (spaghetti) {
        p <- p +
            stat_lm(
                aes(y = .data$mod_prob, group = .data$read_name),
                alpha = 0.25,
                na.rm = TRUE
            )
    }

    # add smoothed line
    plot_data_smooth <- plot_data %>%
        dplyr::group_by(.data[[group_col]], .data$pos) %>%
        dplyr::summarise(
            mod_prob = avg_func(.data$mod_prob)
        ) %>%
        dplyr::mutate(
            mod_prob = rolling_average(
                .data$mod_prob,
                .data$pos,
                smoothing_window = smoothing_window
            )
        )

    p <- p +
        ggplot2::geom_line(
            aes(y = .data$mod_prob),
            data = plot_data_smooth,
            na.rm = TRUE,
            linewidth = line_size
        )

    # add auxiliary elements and style
    if (is.numeric(mod_scale) && length(mod_scale) == 2) {
        p <- p + ggplot2::ylim(mod_scale[1], mod_scale[2])
    }
    p +
        ggplot2::ggtitle(title) +
        ggplot2::xlab(chr) +
        palette_col +
        ggplot2::theme_bw() +
        ggplot2::theme(panel.grid.minor = ggplot2::element_blank()) +
        ggplot2::scale_x_continuous(labels = scales::label_number(scale_cut = scales::cut_si("b")))
}

binarise_statistics <- function(plot_data, binary_threshold) {
    if (!is.null(binary_threshold)) {
        # if binary threshold is provided, convert probabilities to binary values
        plot_data$mod_prob <- as.numeric(
            sigmoid(plot_data$statistic) > binary_threshold
        )
    }
    plot_data
}

join_samples_anno <- function(plot_data, sample_anno) {
    plot_data %>%
        dplyr::inner_join(sample_anno, by = "sample", multiple = "all")
}

add_read_anno <- function(plot_data, read_anno) {
    # incorporate read annotation if available
    if (!is.null(read_anno)) {
        # remove any duplicate columns and use plot_data as reference
        plot_data_names <- setdiff(names(plot_data), "read_name")
        read_anno <- dplyr::select(read_anno, -dplyr::any_of(plot_data_names))
        plot_data <- dplyr::inner_join(plot_data, read_anno, by = "read_name") %>%
            tidyr::drop_na()
    }

    plot_data
}

confidence_weighted_mean <- function(x, threshold = 0.5) {
    weights <- abs(x - threshold)^2
    weights <- weights / sum(weights)
    sum(x * weights)
}

rolling_average <- function(y, x, smoothing_window = 2000, neighbours = smoothing_window / 10) {
    tricube_kern <- function(x) {
        ifelse(abs(x) < 1, (1 - abs(x))^3, 0)
    }

    n <- length(y)
    y_smooth <- rep(NA, n)
    for (i in 1:n) {
        start <- max(1, i - neighbours)
        end <- min(n, i + neighbours)
        dists <- abs(x[start:end] - x[i])
        weights <- tricube_kern(dists / smoothing_window)
        weights <- weights / sum(weights)

        y_smooth[i] <- sum(y[start:end] * weights)
    }

    y_smooth
}