#' Calculate various per sample statistics from a VCF file #' #' Only biallelic SNPs are considered. #' For each sample, the following statistics are calculated : #' \itemize{ #' \item INDIV : Sample name #' \item N_SITES : Total number of SNPs #' \item N_HET : Number of SNPs with #' heterozygous call (\code{0/1} or \code{0|1} #' or \code{1/0} or \code{1|0}) #' \item N_ALT : Number of SNPs with #' alternate homozygous call (\code{1/1} or \code{1|1}) #' \item N_REF : Number of SNPs with #' reference homozygous call (\code{0/0} or \code{0|0}) #' \item N_MISS : Number of SNPs with #' missing call (\code{./.} or \code{.|.}) #' \item P_HET : Percentage of heterozygous calls #' \item P_ALT : Percentage of alternate homozygous calls #' \item P_REF : Percentage of reference homozygous calls #' \item P_MISS : Percentage of missing calls (missing rate) #' } #' #' @param inputFile Input vcf file location (uncompressed or gzip compressed). #' @param outputFile Output samples statistics file location. #' #' @return A \code{\link[base]{data.frame}} of sample statistics. #' @export #' #' @examples #' my.istats <- vcf2istats( #' inputFile = #' system.file("extdata", "samples.vcf.gz", package = "fastreeR") #' ) #' @author Anestis Gkanogiannis, \email{anestis@@gkanogiannis.com} #' @references Java implementation: #' \url{https://github.com/gkanogiannis/BioInfoJava-Utils} vcf2istats <- function(inputFile, outputFile = NULL) { vcf2istats_checkParams(inputFile = inputFile, outputFile = outputFile) bioinfojavautils <- rJava::.jnew( class="com/gkano/bioinfo/javautils/JavaUtils", class.loader = .rJava.class.loader ) cmd <- paste( "VCF2ISTATS", inputFile, sep = " " ) temp.out <- tempfile(fileext = ".txt") on.exit(unlink(temp.out)) jSys <- rJava::J("java/lang/System") jOrigOut <- jSys$out jSys$setOut(rJava::.jnew("java/io/PrintStream", temp.out)) bioinfojavautils$go(rJava::.jarray(strsplit(cmd, "\\s+")[[1]])) jSys$setOut(jOrigOut) ret.str <- stringr::str_replace_all(readLines(temp.out), "\t", " ") ret.df <- utils::read.table(text = ret.str, header = TRUE) if (!is.null(outputFile)) { data.table::fwrite(as.list(ret.str), file = outputFile, sep = "\n") } return(ret.df) } vcf2istats_checkParams <- function(inputFile, outputFile) { if (!methods::is(inputFile, "character")){ stop("inputFile must be a file location.") } if (is.null(inputFile) || !file.exists(inputFile)) { stop("inputFile=",inputFile," does not exist.") } if ((!is.null(outputFile) && !methods::is(outputFile, "character")) || (methods::is(outputFile, "character") && nchar(outputFile)==0)) { stop("outputFile must be a file location.") } }