Browse code

Update doc and format processPileupChrBin() function

adeschen authored on 11/04/2025 17:28:56
Showing 2 changed files

... ...
@@ -478,42 +478,34 @@ extractNucleotide <- function(nucleotide, count, curNucleo) {
478 478
 }
479 479
 
480 480
 
481
-#' @title Read a VCF file with the genotypes use for the ancestry call
481
+#' @title Extract SNV information from pileup file for a selected chromosome
482 482
 #'
483
-#' @description The function reads VCF file and
484
-#' returns a data frame
483
+#' @description The function reads pileup file and
484
+#' returns a \code{data.frame}
485 485
 #' containing the information about the read counts for the SNVs present in
486
-#' the file.
487
-#'
488
-#' @param chr a \code{character} string representing the name, including
489
-#' the path, of a VCF file containing the SNV read counts.
490
-#' The VCF must contain those genotype fields: GT, AD, DP.
486
+#' the selected chromosome.
491 487
 #'
488
+#' @param chr a \code{character} string representing the name of the 
489
+#' chromosome to keep
492 490
 #'
493
-#' @param resPileup result from pileup
491
+#' @param resPileup a \code{data.frame} as generated by the \code{pileup} 
492
+#' function from \code{Rsamtools} package
494 493
 #'
495
-#' @param varDf a \code{data.frame} representing the position to keep
494
+#' @param varDf a \code{data.frame} representing the positions to keep
496 495
 #'
497 496
 #' @param verbose a \code{logical} indicating if messages should be printed
498
-#' to show how the different steps in the function. Default: \code{FALSE}.
499 497
 #'
500 498
 #' @return a \code{data.frame} containing at least:
501 499
 #' \describe{
502
-#' \item{seqnames}{ a  \code{character} representing the name of
503
-#' the chromosome}
504
-#' \item{pos}{ a \code{numeric} representing the position on the
505
-#' chromosome}
500
+#' \item{seqnames}{ a \code{character} representing the name of the chromosome}
501
+#' \item{pos}{ a \code{numeric} representing the position on the chromosome}
506 502
 #' \item{REF}{ a \code{character} string representing the reference nucleotide}
507 503
 #' \item{ALT}{ a \code{character} string representing the alternative
508 504
 #' nucleotide}
509
-#' \item{A}{ a \code{numeric} representing the count for
510
-#' the A nucleotide}
511
-#' \item{C}{ a \code{numeric} representing the count for
512
-#' the C nucleotide}
513
-#' \item{G}{ a \code{numeric} representing the count for
514
-#' the G nucleotide}
515
-#' \item{T}{ a \code{numeric} representing the count for
516
-#' the T nucleotide}
505
+#' \item{A}{ a \code{numeric} representing the count for the A nucleotide}
506
+#' \item{C}{ a \code{numeric} representing the count for the C nucleotide}
507
+#' \item{G}{ a \code{numeric} representing the count for the G nucleotide}
508
+#' \item{T}{ a \code{numeric} representing the count for the T nucleotide}
517 509
 #' \item{count}{ a \code{numeric} representing the total count}
518 510
 #' }
519 511
 #'
... ...
@@ -528,16 +520,20 @@ extractNucleotide <- function(nucleotide, count, curNucleo) {
528 520
 #' @importFrom rlang .data
529 521
 #' @encoding UTF-8
530 522
 #' @keywords internal
531
-processPileupChrBin <- function(chr,
532
-                            resPileup, varDf,
533
-                            verbose=FALSE) {
523
+processPileupChrBin <- function(chr, resPileup, varDf, verbose) {
534 524
 
535 525
     resCur <- NULL
536
-    if(chr %in% names(varDf)){
526
+    
527
+    ## Assign FALSE to verbose by default
528
+    if (is.null(verbose)) {
529
+        verbose <- FALSE
530
+    }
531
+    
532
+    if (chr %in% names(varDf)) {
537 533
 
538 534
         keep <- which(resPileup$seqnames == chr)
539 535
 
540
-        if(length(keep) > 0){
536
+        if (length(keep) > 0) {
541 537
             # restrict the resPileup to the chromosome chr
542 538
             snpO <- resPileup[keep,]
543 539
             rm(keep)
... ...
@@ -546,57 +542,55 @@ processPileupChrBin <- function(chr,
546 542
             vcfCur <- varDf[[chr]][varDf[[chr]]$start >= min(snpO$pos) &
547 543
                                      varDf[[chr]]$start <= max(snpO$pos),]
548 544
 
549
-            if(nrow(vcfCur) > 0){
545
+            if (nrow(vcfCur) > 0) {
550 546
 
551 547
                 # Get the positions to keep in resPileup (snpO)
552
-                tmpTime <- system.time({z <- cbind( c(vcfCur$start,
553
-                                                snpO$pos,
554
-                                                vcfCur$start),
555
-                                            c(rep(-1, nrow(vcfCur)),
556
-                                                rep(0, nrow(snpO)),
557
-                                                rep(1, nrow(vcfCur))),
558
-                                            c(seq_len(nrow(vcfCur)),
559
-                                                seq_len(nrow(snpO)),
560
-                                                seq_len(nrow(vcfCur)))
561
-                                            )
562
-                                z <- z[order(z[,1]),]
563
-                                listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)})
564
-                if(verbose) {message("processPileupChrBin selected pos user ",
565
-                                round(tmpTime[1],3),
566
-                                " system ", round(tmpTime[2],3),
567
-                                " elapsed ", round(tmpTime[3],3))}
568
-
569
-                # print("Match ")
570
-                # summarize by possition with the for base
548
+                tmpTime <- system.time( {z <- cbind( c(vcfCur$start,
549
+                                                snpO$pos, vcfCur$start),
550
+                    c(rep(-1, nrow(vcfCur)), rep(0, nrow(snpO)), 
551
+                        rep(1, nrow(vcfCur))),
552
+                    c(seq_len(nrow(vcfCur)), seq_len(nrow(snpO)),
553
+                        seq_len(nrow(vcfCur))))
554
+                    z <- z[order(z[,1]),]
555
+                    listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)} )
556
+                
557
+                if (verbose) {
558
+                    message("processPileupChrBin selected pos user ",
559
+                        round(tmpTime[1],3), " system ", round(tmpTime[2],3),
560
+                        " elapsed ", round(tmpTime[3],3)) }
561
+
562
+                # summarize by position with the for base
571 563
                 tmpTime <- system.time(resCur <- as.data.frame(snpO[z[listKeep, 3],] %>%
572
-                                                              group_by(.data$pos) %>%
573
-                                                              summarize(seqnames=.data$seqnames[1],
574
-                                                                        A=extractNucleotide(.data$nucleotide,.data$count, "A"),
575
-                                                                        C=extractNucleotide(.data$nucleotide,.data$count, "C"),
576
-                                                                        G=extractNucleotide(.data$nucleotide,.data$count, "G"),
577
-                                                                        T=extractNucleotide(.data$nucleotide,.data$count, "T"),
578
-                                                                        count=sum(.data$count))))
579
-                if(verbose) {message("processPileupChrBin extracted nucleotides user ",
580
-                                     round(tmpTime[1],3),
581
-                                     " system ", round(tmpTime[2],3),
582
-                                     " elapsed ", round(tmpTime[3],3))}
583
-                # print("Second Z")
564
+                    group_by(.data$pos) %>% 
565
+                        summarize(seqnames=.data$seqnames[1],
566
+                        A=extractNucleotide(.data$nucleotide,.data$count, "A"),
567
+                        C=extractNucleotide(.data$nucleotide,.data$count, "C"),
568
+                        G=extractNucleotide(.data$nucleotide,.data$count, "G"),
569
+                        T=extractNucleotide(.data$nucleotide,.data$count, "T"),
570
+                        count=sum(.data$count))))
571
+                
572
+                if (verbose) {
573
+                    message("processPileupChrBin extracted nucleotides user ",
574
+                        round(tmpTime[1],3), " system ", round(tmpTime[2],3),
575
+                        " elapsed ", round(tmpTime[3],3)) }
576
+                
584 577
                 # Add the reference allele and the alternative allele
585 578
                 tmpTime <- system.time({z <- cbind(c(resCur$pos, vcfCur$start, resCur$pos),
586
-                                    c(rep(-1,nrow(resCur)),rep(0, nrow(vcfCur)),rep(1, nrow(resCur))),
587
-                                    c(seq_len(nrow(resCur)), seq_len(nrow(vcfCur)), seq_len(nrow(resCur))))
588
-                                z <- z[order(z[,1]),]
589
-                                listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)
590
-                                resCur$REF <- rep("N", nrow(resCur))
591
-                                resCur$ALT <- rep("N", nrow(resCur))
592
-                                resCur$REF[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "REF"]
593
-                                resCur$ALT[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "ALT"]
594
-                                resCur <- resCur[,c("seqnames", "pos", "REF", "ALT", "A", "C", "G", "T", "count")]})
595
-
596
-                if(verbose) {message("processPileupChrBin add ref and alt allele user ",
597
-                                     round(tmpTime[1],3),
598
-                                     " system ", round(tmpTime[2],3),
599
-                                     " elapsed ", round(tmpTime[3],3))}
579
+                    c(rep(-1, nrow(resCur)), rep(0, nrow(vcfCur)), rep(1, nrow(resCur))),
580
+                    c(seq_len(nrow(resCur)), seq_len(nrow(vcfCur)), seq_len(nrow(resCur))))
581
+                    z <- z[order(z[,1]),]
582
+                    listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)
583
+                    resCur$REF <- rep("N", nrow(resCur))
584
+                    resCur$ALT <- rep("N", nrow(resCur))
585
+                    resCur$REF[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "REF"]
586
+                    resCur$ALT[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "ALT"]
587
+                    resCur <- resCur[,c("seqnames", "pos", "REF", "ALT", "A", 
588
+                                            "C", "G", "T", "count")]} )
589
+
590
+                if(verbose) {
591
+                    message("processPileupChrBin add ref and alt allele user ",
592
+                        round(tmpTime[1],3), " system ", round(tmpTime[2],3),
593
+                        " elapsed ", round(tmpTime[3],3)) }
600 594
             }
601 595
         }
602 596
     }
... ...
@@ -5,7 +5,7 @@
5 5
 \alias{processPileupChrBin}
6 6
 \title{Read a VCF file with the genotypes use for the ancestry call}
7 7
 \usage{
8
-processPileupChrBin(chr, resPileup, varDf, verbose = FALSE)
8
+processPileupChrBin(chr, resPileup, varDf, verbose)
9 9
 }
10 10
 \arguments{
11 11
 \item{chr}{a \code{character} string representing the name, including
... ...
@@ -14,37 +14,30 @@ The VCF must contain those genotype fields: GT, AD, DP.}
14 14
 
15 15
 \item{resPileup}{result from pileup}
16 16
 
17
-\item{varDf}{a \code{data.frame} representing the position to keep}
17
+\item{varDf}{a \code{data.frame} representing the positions to keep}
18 18
 
19
-\item{verbose}{a \code{logical} indicating if messages should be printed
20
-to show how the different steps in the function. Default: \code{FALSE}.}
19
+\item{verbose}{a \code{logical} indicating if messages should be printed}
21 20
 }
22 21
 \value{
23 22
 a \code{data.frame} containing at least:
24 23
 \describe{
25
-\item{seqnames}{ a  \code{character} representing the name of
26
-the chromosome}
27
-\item{pos}{ a \code{numeric} representing the position on the
28
-chromosome}
24
+\item{seqnames}{ a \code{character} representing the name of the chromosome}
25
+\item{pos}{ a \code{numeric} representing the position on the chromosome}
29 26
 \item{REF}{ a \code{character} string representing the reference nucleotide}
30 27
 \item{ALT}{ a \code{character} string representing the alternative
31 28
 nucleotide}
32
-\item{A}{ a \code{numeric} representing the count for
33
-the A nucleotide}
34
-\item{C}{ a \code{numeric} representing the count for
35
-the C nucleotide}
36
-\item{G}{ a \code{numeric} representing the count for
37
-the G nucleotide}
38
-\item{T}{ a \code{numeric} representing the count for
39
-the T nucleotide}
29
+\item{A}{ a \code{numeric} representing the count for the A nucleotide}
30
+\item{C}{ a \code{numeric} representing the count for the C nucleotide}
31
+\item{G}{ a \code{numeric} representing the count for the G nucleotide}
32
+\item{T}{ a \code{numeric} representing the count for the T nucleotide}
40 33
 \item{count}{ a \code{numeric} representing the total count}
41 34
 }
42 35
 }
43 36
 \description{
44 37
 The function reads VCF file and
45
-returns a data frame
38
+returns a \code{data.frame}
46 39
 containing the information about the read counts for the SNVs present in
47
-the file.
40
+the VCF.
48 41
 }
49 42
 \examples{
50 43