Browse code

Changed averaging to be on methy_prob rather than statistic Added pre-smoothing

shians authored on 22/07/2020 03:24:20
Showing 1 changed files

... ...
@@ -14,15 +14,18 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
14 14
 
15 15
     assert_that(is_df_or_granges(regions) || is.list(regions))
16 16
 
17
+    # convert GRanges to tibble
17 18
     if (is(regions, "GRanges")) {
18 19
         regions <- tibble::as_tibble(regions) %>%
19 20
             dplyr::rename(chr = "seqnames")
20 21
     }
21 22
 
23
+    # convert data.frame regions to single element list
22 24
     if (!is.null(dim(regions))) {
23 25
         regions <- list(regions)
24 26
     }
25 27
 
28
+    # query methylation data
26 29
     methy_data <- purrr::map(
27 30
         regions,
28 31
         function(features) {
... ...
@@ -42,43 +45,43 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
42 45
     methy_data <- methy_data %>%
43 46
         dplyr::bind_rows(.id = "group")
44 47
 
45
-    methy_data <- methy_data %>%
48
+    if (!is.null(groups)) {
49
+        methy_data <- methy_data %>%
46 50
         dplyr::group_by(.data$group, .data$rel_pos) %>%
47
-        dplyr::summarise(statistic = mean(.data$statistic)) %>%
48
-        dplyr::mutate(methy_prob = e1071::sigmoid(statistic)) %>%
51
+        dplyr::summarise(methy_prob = mean(.data$methy_prob)) %>%
49 52
         dplyr::ungroup()
50
-
51
-    methy_data_before <- filter(methy_data, rel_pos <= 0)
52
-    if (nrow(methy_data_before) > 10000) {
53
-        methy_data_before <- dplyr::sample_n(methy_data_before, 10000)
54
-    }
55
-
56
-    methy_data_after <- filter(methy_data, rel_pos >= 1)
57
-    if (nrow(methy_data_after) > 10000) {
58
-        methy_data_after <- dplyr::sample_n(methy_data_after, 10000)
59
-    }
60
-
61
-    methy_data <- filter(methy_data, between(rel_pos, 0, 1))
62
-    if (nrow(methy_data) > 10000) {
63
-        methy_data <- dplyr::sample_n(methy_data, 10000)
53
+    } else {
54
+        methy_data <- methy_data %>%
55
+            dplyr::group_by(.data$rel_pos) %>%
56
+            dplyr::summarise(methy_prob = mean(.data$methy_prob)) %>%
57
+            dplyr::ungroup()
64 58
     }
65 59
 
60
+    # set up plot
66 61
     p <- ggplot2::ggplot() +
67 62
         ggplot2::ylim(c(0, 1)) +
68 63
         ggplot2::theme_minimal()
69 64
 
65
+    # take binned means
66
+    grid_size <- 2^10
67
+    binned_pos <- seq(-1.1/3, 1 + 1.1/3, length.out = grid_size+ 2)[2:(1 + grid_size)]
68
+    methy_data <- methy_data %>%
69
+        mutate(interval = cut(rel_pos, breaks = grid_size)) %>%
70
+        group_by(interval, .drop = TRUE) %>%
71
+        summarise(methy_prob = mean(methy_prob)) %>%
72
+        ungroup() %>%
73
+        mutate(rel_pos = binned_pos)
74
+
75
+    span <- 0.10
76
+
70 77
     if (!is.null(groups)) {
71 78
         p <- p +
72
-            .geom_smooth(methy_data, span = 0.3, group = TRUE) +
73
-            .geom_smooth(methy_data_before, span = 0.35, group = TRUE) +
74
-            .geom_smooth(methy_data_after, span = 0.35, group = TRUE) +
79
+            .geom_smooth(methy_data, sspan = span, group = TRUE) +
75 80
             ggplot2::scale_colour_brewer(palette = "Set1") +
76 81
             ggplot2::coord_cartesian(clip = "off")
77 82
     } else {
78 83
         p <- p +
79
-            .geom_smooth(methy_data, span = 0.3, group = FALSE) +
80
-            .geom_smooth(methy_data_before, span = 0.35, group = FALSE) +
81
-            .geom_smooth(methy_data_after, span = 0.35, group = FALSE) +
84
+            .geom_smooth(methy_data, span = span, group = FALSE) +
82 85
             ggplot2::coord_cartesian(clip = "off")
83 86
     }
84 87
 
... ...
@@ -91,8 +94,8 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
91 94
         geom_vline(xintercept = 1, linetype = "dashed", color = "grey80") +
92 95
         ggplot2::scale_x_continuous(
93 96
             name = "Relative Position",
94
-            breaks = c(-.25, 0, 1, 1.25),
95
-            limits = c(-0.25, 1.25),
97
+            breaks = c(-.33, 0, 1, 1.33),
98
+            limits = c(-0.33, 1.33),
96 99
             labels = labels) +
97 100
         ggplot2::ylab("Average Methylation Probability")
98 101
 }
... ...
@@ -110,8 +113,8 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
110 113
 
111 114
         out <- numeric(length(x))
112 115
         out[within] <- (x[within] - feature$start) / (feature$end - feature$start)
113
-        out[upstream] <- (x[upstream] - feature$start) / flank / 4
114
-        out[downstream] <- 1 + ((x[downstream] - feature$end) / flank / 4)
116
+        out[upstream] <- (x[upstream] - feature$start) / flank / 3
117
+        out[downstream] <- 1 + ((x[downstream] - feature$end) / flank / 3)
115 118
         if (stranded && length(feature$strand) > 0 && feature$strand == "-") {
116 119
             out <- 1 - out
117 120
         }
... ...
@@ -121,8 +124,8 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
121 124
     out <- query_methy(
122 125
         methy,
123 126
         features$chr,
124
-        features$start - flank * 1.1,
125
-        features$end + flank * 1.1,
127
+        features$start - flank * 1.10,
128
+        features$end + flank * 1.10,
126 129
         simplify = FALSE)
127 130
 
128 131
     out <- purrr::map2(out, .split_rows(features), function(x, y) {
... ...
@@ -133,8 +136,8 @@ plot_aggregate_regions <- function(x, regions, groups = NULL, flank = 2000, stra
133 136
         x$rel_pos <- .scale_to_feature(x$pos, y, stranded = stranded)
134 137
 
135 138
         x %>%
136
-            group_by(chr, rel_pos) %>%
137
-            summarise(statistic = mean(statistic)) %>%
139
+            group_by(rel_pos) %>%
140
+            summarise(methy_prob = mean(e1071::sigmoid(statistic))) %>%
138 141
             ungroup()
139 142
     })
140 143