... | ... |
@@ -478,42 +478,34 @@ extractNucleotide <- function(nucleotide, count, curNucleo) { |
478 | 478 |
} |
479 | 479 |
|
480 | 480 |
|
481 |
-#' @title Read a VCF file with the genotypes use for the ancestry call |
|
481 |
+#' @title Extract SNV information from pileup file for a selected chromosome |
|
482 | 482 |
#' |
483 |
-#' @description The function reads VCF file and |
|
484 |
-#' returns a data frame |
|
483 |
+#' @description The function reads pileup file and |
|
484 |
+#' returns a \code{data.frame} |
|
485 | 485 |
#' containing the information about the read counts for the SNVs present in |
486 |
-#' the file. |
|
487 |
-#' |
|
488 |
-#' @param chr a \code{character} string representing the name, including |
|
489 |
-#' the path, of a VCF file containing the SNV read counts. |
|
490 |
-#' The VCF must contain those genotype fields: GT, AD, DP. |
|
486 |
+#' the selected chromosome. |
|
491 | 487 |
#' |
488 |
+#' @param chr a \code{character} string representing the name of the |
|
489 |
+#' chromosome to keep |
|
492 | 490 |
#' |
493 |
-#' @param resPileup result from pileup |
|
491 |
+#' @param resPileup a \code{data.frame} as generated by the \code{pileup} |
|
492 |
+#' function from \code{Rsamtools} package |
|
494 | 493 |
#' |
495 |
-#' @param varDf a \code{data.frame} representing the position to keep |
|
494 |
+#' @param varDf a \code{data.frame} representing the positions to keep |
|
496 | 495 |
#' |
497 | 496 |
#' @param verbose a \code{logical} indicating if messages should be printed |
498 |
-#' to show how the different steps in the function. Default: \code{FALSE}. |
|
499 | 497 |
#' |
500 | 498 |
#' @return a \code{data.frame} containing at least: |
501 | 499 |
#' \describe{ |
502 |
-#' \item{seqnames}{ a \code{character} representing the name of |
|
503 |
-#' the chromosome} |
|
504 |
-#' \item{pos}{ a \code{numeric} representing the position on the |
|
505 |
-#' chromosome} |
|
500 |
+#' \item{seqnames}{ a \code{character} representing the name of the chromosome} |
|
501 |
+#' \item{pos}{ a \code{numeric} representing the position on the chromosome} |
|
506 | 502 |
#' \item{REF}{ a \code{character} string representing the reference nucleotide} |
507 | 503 |
#' \item{ALT}{ a \code{character} string representing the alternative |
508 | 504 |
#' nucleotide} |
509 |
-#' \item{A}{ a \code{numeric} representing the count for |
|
510 |
-#' the A nucleotide} |
|
511 |
-#' \item{C}{ a \code{numeric} representing the count for |
|
512 |
-#' the C nucleotide} |
|
513 |
-#' \item{G}{ a \code{numeric} representing the count for |
|
514 |
-#' the G nucleotide} |
|
515 |
-#' \item{T}{ a \code{numeric} representing the count for |
|
516 |
-#' the T nucleotide} |
|
505 |
+#' \item{A}{ a \code{numeric} representing the count for the A nucleotide} |
|
506 |
+#' \item{C}{ a \code{numeric} representing the count for the C nucleotide} |
|
507 |
+#' \item{G}{ a \code{numeric} representing the count for the G nucleotide} |
|
508 |
+#' \item{T}{ a \code{numeric} representing the count for the T nucleotide} |
|
517 | 509 |
#' \item{count}{ a \code{numeric} representing the total count} |
518 | 510 |
#' } |
519 | 511 |
#' |
... | ... |
@@ -528,16 +520,20 @@ extractNucleotide <- function(nucleotide, count, curNucleo) { |
528 | 520 |
#' @importFrom rlang .data |
529 | 521 |
#' @encoding UTF-8 |
530 | 522 |
#' @keywords internal |
531 |
-processPileupChrBin <- function(chr, |
|
532 |
- resPileup, varDf, |
|
533 |
- verbose=FALSE) { |
|
523 |
+processPileupChrBin <- function(chr, resPileup, varDf, verbose) { |
|
534 | 524 |
|
535 | 525 |
resCur <- NULL |
536 |
- if(chr %in% names(varDf)){ |
|
526 |
+ |
|
527 |
+ ## Assign FALSE to verbose by default |
|
528 |
+ if (is.null(verbose)) { |
|
529 |
+ verbose <- FALSE |
|
530 |
+ } |
|
531 |
+ |
|
532 |
+ if (chr %in% names(varDf)) { |
|
537 | 533 |
|
538 | 534 |
keep <- which(resPileup$seqnames == chr) |
539 | 535 |
|
540 |
- if(length(keep) > 0){ |
|
536 |
+ if (length(keep) > 0) { |
|
541 | 537 |
# restrict the resPileup to the chromosome chr |
542 | 538 |
snpO <- resPileup[keep,] |
543 | 539 |
rm(keep) |
... | ... |
@@ -546,57 +542,55 @@ processPileupChrBin <- function(chr, |
546 | 542 |
vcfCur <- varDf[[chr]][varDf[[chr]]$start >= min(snpO$pos) & |
547 | 543 |
varDf[[chr]]$start <= max(snpO$pos),] |
548 | 544 |
|
549 |
- if(nrow(vcfCur) > 0){ |
|
545 |
+ if (nrow(vcfCur) > 0) { |
|
550 | 546 |
|
551 | 547 |
# Get the positions to keep in resPileup (snpO) |
552 |
- tmpTime <- system.time({z <- cbind( c(vcfCur$start, |
|
553 |
- snpO$pos, |
|
554 |
- vcfCur$start), |
|
555 |
- c(rep(-1, nrow(vcfCur)), |
|
556 |
- rep(0, nrow(snpO)), |
|
557 |
- rep(1, nrow(vcfCur))), |
|
558 |
- c(seq_len(nrow(vcfCur)), |
|
559 |
- seq_len(nrow(snpO)), |
|
560 |
- seq_len(nrow(vcfCur))) |
|
561 |
- ) |
|
562 |
- z <- z[order(z[,1]),] |
|
563 |
- listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)}) |
|
564 |
- if(verbose) {message("processPileupChrBin selected pos user ", |
|
565 |
- round(tmpTime[1],3), |
|
566 |
- " system ", round(tmpTime[2],3), |
|
567 |
- " elapsed ", round(tmpTime[3],3))} |
|
568 |
- |
|
569 |
- # print("Match ") |
|
570 |
- # summarize by possition with the for base |
|
548 |
+ tmpTime <- system.time( {z <- cbind( c(vcfCur$start, |
|
549 |
+ snpO$pos, vcfCur$start), |
|
550 |
+ c(rep(-1, nrow(vcfCur)), rep(0, nrow(snpO)), |
|
551 |
+ rep(1, nrow(vcfCur))), |
|
552 |
+ c(seq_len(nrow(vcfCur)), seq_len(nrow(snpO)), |
|
553 |
+ seq_len(nrow(vcfCur)))) |
|
554 |
+ z <- z[order(z[,1]),] |
|
555 |
+ listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0)} ) |
|
556 |
+ |
|
557 |
+ if (verbose) { |
|
558 |
+ message("processPileupChrBin selected pos user ", |
|
559 |
+ round(tmpTime[1],3), " system ", round(tmpTime[2],3), |
|
560 |
+ " elapsed ", round(tmpTime[3],3)) } |
|
561 |
+ |
|
562 |
+ # summarize by position with the for base |
|
571 | 563 |
tmpTime <- system.time(resCur <- as.data.frame(snpO[z[listKeep, 3],] %>% |
572 |
- group_by(.data$pos) %>% |
|
573 |
- summarize(seqnames=.data$seqnames[1], |
|
574 |
- A=extractNucleotide(.data$nucleotide,.data$count, "A"), |
|
575 |
- C=extractNucleotide(.data$nucleotide,.data$count, "C"), |
|
576 |
- G=extractNucleotide(.data$nucleotide,.data$count, "G"), |
|
577 |
- T=extractNucleotide(.data$nucleotide,.data$count, "T"), |
|
578 |
- count=sum(.data$count)))) |
|
579 |
- if(verbose) {message("processPileupChrBin extracted nucleotides user ", |
|
580 |
- round(tmpTime[1],3), |
|
581 |
- " system ", round(tmpTime[2],3), |
|
582 |
- " elapsed ", round(tmpTime[3],3))} |
|
583 |
- # print("Second Z") |
|
564 |
+ group_by(.data$pos) %>% |
|
565 |
+ summarize(seqnames=.data$seqnames[1], |
|
566 |
+ A=extractNucleotide(.data$nucleotide,.data$count, "A"), |
|
567 |
+ C=extractNucleotide(.data$nucleotide,.data$count, "C"), |
|
568 |
+ G=extractNucleotide(.data$nucleotide,.data$count, "G"), |
|
569 |
+ T=extractNucleotide(.data$nucleotide,.data$count, "T"), |
|
570 |
+ count=sum(.data$count)))) |
|
571 |
+ |
|
572 |
+ if (verbose) { |
|
573 |
+ message("processPileupChrBin extracted nucleotides user ", |
|
574 |
+ round(tmpTime[1],3), " system ", round(tmpTime[2],3), |
|
575 |
+ " elapsed ", round(tmpTime[3],3)) } |
|
576 |
+ |
|
584 | 577 |
# Add the reference allele and the alternative allele |
585 | 578 |
tmpTime <- system.time({z <- cbind(c(resCur$pos, vcfCur$start, resCur$pos), |
586 |
- c(rep(-1,nrow(resCur)),rep(0, nrow(vcfCur)),rep(1, nrow(resCur))), |
|
587 |
- c(seq_len(nrow(resCur)), seq_len(nrow(vcfCur)), seq_len(nrow(resCur)))) |
|
588 |
- z <- z[order(z[,1]),] |
|
589 |
- listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0) |
|
590 |
- resCur$REF <- rep("N", nrow(resCur)) |
|
591 |
- resCur$ALT <- rep("N", nrow(resCur)) |
|
592 |
- resCur$REF[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "REF"] |
|
593 |
- resCur$ALT[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "ALT"] |
|
594 |
- resCur <- resCur[,c("seqnames", "pos", "REF", "ALT", "A", "C", "G", "T", "count")]}) |
|
595 |
- |
|
596 |
- if(verbose) {message("processPileupChrBin add ref and alt allele user ", |
|
597 |
- round(tmpTime[1],3), |
|
598 |
- " system ", round(tmpTime[2],3), |
|
599 |
- " elapsed ", round(tmpTime[3],3))} |
|
579 |
+ c(rep(-1, nrow(resCur)), rep(0, nrow(vcfCur)), rep(1, nrow(resCur))), |
|
580 |
+ c(seq_len(nrow(resCur)), seq_len(nrow(vcfCur)), seq_len(nrow(resCur)))) |
|
581 |
+ z <- z[order(z[,1]),] |
|
582 |
+ listKeep <- which(cumsum(z[,2]) < 0 & z[,2]==0) |
|
583 |
+ resCur$REF <- rep("N", nrow(resCur)) |
|
584 |
+ resCur$ALT <- rep("N", nrow(resCur)) |
|
585 |
+ resCur$REF[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "REF"] |
|
586 |
+ resCur$ALT[z[listKeep-1,3]] <- vcfCur[z[listKeep,3], "ALT"] |
|
587 |
+ resCur <- resCur[,c("seqnames", "pos", "REF", "ALT", "A", |
|
588 |
+ "C", "G", "T", "count")]} ) |
|
589 |
+ |
|
590 |
+ if(verbose) { |
|
591 |
+ message("processPileupChrBin add ref and alt allele user ", |
|
592 |
+ round(tmpTime[1],3), " system ", round(tmpTime[2],3), |
|
593 |
+ " elapsed ", round(tmpTime[3],3)) } |
|
600 | 594 |
} |
601 | 595 |
} |
602 | 596 |
} |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
\alias{processPileupChrBin} |
6 | 6 |
\title{Read a VCF file with the genotypes use for the ancestry call} |
7 | 7 |
\usage{ |
8 |
-processPileupChrBin(chr, resPileup, varDf, verbose = FALSE) |
|
8 |
+processPileupChrBin(chr, resPileup, varDf, verbose) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{chr}{a \code{character} string representing the name, including |
... | ... |
@@ -14,37 +14,30 @@ The VCF must contain those genotype fields: GT, AD, DP.} |
14 | 14 |
|
15 | 15 |
\item{resPileup}{result from pileup} |
16 | 16 |
|
17 |
-\item{varDf}{a \code{data.frame} representing the position to keep} |
|
17 |
+\item{varDf}{a \code{data.frame} representing the positions to keep} |
|
18 | 18 |
|
19 |
-\item{verbose}{a \code{logical} indicating if messages should be printed |
|
20 |
-to show how the different steps in the function. Default: \code{FALSE}.} |
|
19 |
+\item{verbose}{a \code{logical} indicating if messages should be printed} |
|
21 | 20 |
} |
22 | 21 |
\value{ |
23 | 22 |
a \code{data.frame} containing at least: |
24 | 23 |
\describe{ |
25 |
-\item{seqnames}{ a \code{character} representing the name of |
|
26 |
-the chromosome} |
|
27 |
-\item{pos}{ a \code{numeric} representing the position on the |
|
28 |
-chromosome} |
|
24 |
+\item{seqnames}{ a \code{character} representing the name of the chromosome} |
|
25 |
+\item{pos}{ a \code{numeric} representing the position on the chromosome} |
|
29 | 26 |
\item{REF}{ a \code{character} string representing the reference nucleotide} |
30 | 27 |
\item{ALT}{ a \code{character} string representing the alternative |
31 | 28 |
nucleotide} |
32 |
-\item{A}{ a \code{numeric} representing the count for |
|
33 |
-the A nucleotide} |
|
34 |
-\item{C}{ a \code{numeric} representing the count for |
|
35 |
-the C nucleotide} |
|
36 |
-\item{G}{ a \code{numeric} representing the count for |
|
37 |
-the G nucleotide} |
|
38 |
-\item{T}{ a \code{numeric} representing the count for |
|
39 |
-the T nucleotide} |
|
29 |
+\item{A}{ a \code{numeric} representing the count for the A nucleotide} |
|
30 |
+\item{C}{ a \code{numeric} representing the count for the C nucleotide} |
|
31 |
+\item{G}{ a \code{numeric} representing the count for the G nucleotide} |
|
32 |
+\item{T}{ a \code{numeric} representing the count for the T nucleotide} |
|
40 | 33 |
\item{count}{ a \code{numeric} representing the total count} |
41 | 34 |
} |
42 | 35 |
} |
43 | 36 |
\description{ |
44 | 37 |
The function reads VCF file and |
45 |
-returns a data frame |
|
38 |
+returns a \code{data.frame} |
|
46 | 39 |
containing the information about the read counts for the SNVs present in |
47 |
-the file. |
|
40 |
+the VCF. |
|
48 | 41 |
} |
49 | 42 |
\examples{ |
50 | 43 |
|