... | ... |
@@ -5,6 +5,10 @@ convert_methy_to_dss_cpp <- function(input, output_dir) { |
5 | 5 |
.Call('_NanoMethViz_convert_methy_to_dss_cpp', PACKAGE = 'NanoMethViz', input, output_dir) |
6 | 6 |
} |
7 | 7 |
|
8 |
+count_cg_cpp <- function(str) { |
|
9 |
+ .Call('_NanoMethViz_count_cg_cpp', PACKAGE = 'NanoMethViz', str) |
|
10 |
+} |
|
11 |
+ |
|
8 | 12 |
get_char_pos_cpp <- function(x, c) { |
9 | 13 |
.Call('_NanoMethViz_get_char_pos_cpp', PACKAGE = 'NanoMethViz', x, c) |
10 | 14 |
} |
11 | 15 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,53 @@ |
1 |
+get_read_entropy <- function(bam_path, sample = fs::path_file(bam_path)) { |
|
2 |
+ # helper functions ---- |
|
3 |
+ count_crossings <- function(numbers) { |
|
4 |
+ crossings <- sum(((numbers[-1] - 127.5) * (numbers[-length(numbers)] - 127.5)) < 0) |
|
5 |
+ return(crossings) |
|
6 |
+ } |
|
7 |
+ |
|
8 |
+ read_chunk <- function(bam_file) { |
|
9 |
+ Rsamtools::scanBam(bam_file, param = modbam_param())[[1]] |
|
10 |
+ } |
|
11 |
+ |
|
12 |
+ parse_chunk <- function(reads, sample) { |
|
13 |
+ reads <- reads[!is.null(reads$seq)] |
|
14 |
+ tibble::tibble( |
|
15 |
+ sample = factor(sample), |
|
16 |
+ read_name = reads$qname, |
|
17 |
+ chr = factor(reads$rname), |
|
18 |
+ pos = reads$pos, |
|
19 |
+ width = Biostrings::width(reads$seq), |
|
20 |
+ center = as.integer(pos + width/2), |
|
21 |
+ cg_count = map_int(as.character(reads$seq), count_cg), |
|
22 |
+ crossings = purrr::map_int(reads$tag$ML, count_crossings), |
|
23 |
+ entropy = crossings / cg_count |
|
24 |
+ ) |
|
25 |
+ } |
|
26 |
+ |
|
27 |
+ # function body ---- |
|
28 |
+ fname <- fs::path_file(bam_path) |
|
29 |
+ cli::cli_progress_bar( |
|
30 |
+ glue::glue("Parsing file: {fname}"), |
|
31 |
+ total = get_bam_total_reads(bam_path), |
|
32 |
+ format_done = paste0( |
|
33 |
+ "{.alert-success Data parsed: ", fname, " {.timestamp {cli::pb_elapsed}}}"), |
|
34 |
+ format_failed = paste0( |
|
35 |
+ "{.alert-danger Data parsing failed: ", fname, " {.timestamp {cli::pb_elapsed}}}"), |
|
36 |
+ clear = FALSE |
|
37 |
+ ) |
|
38 |
+ |
|
39 |
+ bam_file <- Rsamtools::BamFile(bam_path, yieldSize = 15000) |
|
40 |
+ |
|
41 |
+ i <- 1 |
|
42 |
+ df_list <- list() |
|
43 |
+ open(bam_file) |
|
44 |
+ while (Rsamtools::isIncomplete(bam_file)) { |
|
45 |
+ reads <- read_chunk(bam_file) |
|
46 |
+ df_list[[i]] <- parse_chunk(reads, sample) |
|
47 |
+ i <- i + 1 |
|
48 |
+ cli::cli_progress_update(length(reads[[1]][[1]])) |
|
49 |
+ } |
|
50 |
+ close(bam_file) |
|
51 |
+ |
|
52 |
+ bind_rows(df_list) |
|
53 |
+} |
0 | 54 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,22 @@ |
1 |
+#include <string> |
|
2 |
+#include <Rcpp.h> |
|
3 |
+ |
|
4 |
+using namespace std; |
|
5 |
+using namespace Rcpp; |
|
6 |
+ |
|
7 |
+// function to count "CG" dinucleotides in a DNA string |
|
8 |
+// [[Rcpp::export]] |
|
9 |
+int count_cg_cpp(std::string str) { |
|
10 |
+ int count = 0; |
|
11 |
+ int length = str.length(); |
|
12 |
+ |
|
13 |
+ // loop through string |
|
14 |
+ for (int i = 0; i < length - 1; i++) { |
|
15 |
+ // if "CG" found, increment count |
|
16 |
+ if (str[i] == 'C' && str[i + 1] == 'G') { |
|
17 |
+ count++; |
|
18 |
+ } |
|
19 |
+ } |
|
20 |
+ |
|
21 |
+ return count; |
|
22 |
+} |