Browse code

Added get_read_entropy feature

Shians authored on 09/06/2023 03:02:40
Showing 3 changed files

... ...
@@ -5,6 +5,10 @@ convert_methy_to_dss_cpp <- function(input, output_dir) {
5 5
     .Call('_NanoMethViz_convert_methy_to_dss_cpp', PACKAGE = 'NanoMethViz', input, output_dir)
6 6
 }
7 7
 
8
+count_cg_cpp <- function(str) {
9
+    .Call('_NanoMethViz_count_cg_cpp', PACKAGE = 'NanoMethViz', str)
10
+}
11
+
8 12
 get_char_pos_cpp <- function(x, c) {
9 13
     .Call('_NanoMethViz_get_char_pos_cpp', PACKAGE = 'NanoMethViz', x, c)
10 14
 }
11 15
new file mode 100644
... ...
@@ -0,0 +1,53 @@
1
+get_read_entropy <- function(bam_path, sample = fs::path_file(bam_path)) {
2
+    # helper functions ----
3
+    count_crossings <- function(numbers) {
4
+        crossings <- sum(((numbers[-1] - 127.5) * (numbers[-length(numbers)] - 127.5)) < 0)
5
+        return(crossings)
6
+    }
7
+
8
+    read_chunk <- function(bam_file) {
9
+        Rsamtools::scanBam(bam_file, param = modbam_param())[[1]]
10
+    }
11
+
12
+    parse_chunk <- function(reads, sample) {
13
+        reads <- reads[!is.null(reads$seq)]
14
+        tibble::tibble(
15
+            sample = factor(sample),
16
+            read_name = reads$qname,
17
+            chr = factor(reads$rname),
18
+            pos = reads$pos,
19
+            width = Biostrings::width(reads$seq),
20
+            center = as.integer(pos + width/2),
21
+            cg_count = map_int(as.character(reads$seq), count_cg),
22
+            crossings = purrr::map_int(reads$tag$ML, count_crossings),
23
+            entropy = crossings / cg_count
24
+        )
25
+    }
26
+
27
+    # function body ----
28
+    fname <- fs::path_file(bam_path)
29
+        cli::cli_progress_bar(
30
+            glue::glue("Parsing file: {fname}"),
31
+            total = get_bam_total_reads(bam_path),
32
+            format_done = paste0(
33
+                "{.alert-success Data parsed: ", fname, " {.timestamp {cli::pb_elapsed}}}"),
34
+            format_failed = paste0(
35
+                "{.alert-danger Data parsing failed: ", fname, " {.timestamp {cli::pb_elapsed}}}"),
36
+            clear = FALSE
37
+        )
38
+
39
+    bam_file <- Rsamtools::BamFile(bam_path, yieldSize = 15000)
40
+
41
+    i <- 1
42
+    df_list <- list()
43
+    open(bam_file)
44
+    while (Rsamtools::isIncomplete(bam_file)) {
45
+        reads <- read_chunk(bam_file)
46
+        df_list[[i]] <- parse_chunk(reads, sample)
47
+        i <- i + 1
48
+        cli::cli_progress_update(length(reads[[1]][[1]]))
49
+    }
50
+    close(bam_file)
51
+
52
+    bind_rows(df_list)
53
+}
0 54
new file mode 100644
... ...
@@ -0,0 +1,22 @@
1
+#include <string>
2
+#include <Rcpp.h>
3
+
4
+using namespace std;
5
+using namespace Rcpp;
6
+
7
+// function to count "CG" dinucleotides in a DNA string
8
+// [[Rcpp::export]]
9
+int count_cg_cpp(std::string str) {
10
+    int count = 0;
11
+    int length = str.length();
12
+
13
+    // loop through string
14
+    for (int i = 0; i < length - 1; i++) {
15
+        // if "CG" found, increment count
16
+        if (str[i] == 'C' && str[i + 1] == 'G') {
17
+            count++;
18
+        }
19
+    }
20
+
21
+    return count;
22
+}