Bioconductor Code: RAIDS

R/gdsWrapper_internal.R

dd76ad8e	#' @title Initialization of the section related to the profile
be6ebeca	#' information in the GDS file #'
dd76ad8e	#' @description This function initializes the section related to the profile
be6ebeca	#' information in the GDS file. The information is extracted from #' the \code{data.frame} passed to the function. The nodes "sample.id" and #' "sample.annot" are created in the GDS file. #'
48837112	#' @param gdsReference an object of class
be6ebeca	#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. #'
32e75790	#' @param dfPedReference a \code{data.frame} containing the information #' related to the
be6ebeca	#' samples. It must have those columns: "sample.id", "Name.ID", "sex", #' "pop.group", "superPop" and "batch". All columns, except "sex" and batch", #' are \code{character} strings. The "batch" and "sex" columns are #' \code{integer}. The unique identifier #' of this \code{data.frame} is the "Name.ID" column. The row names of the #' \code{data.frame} must correspond to the identifiers present in the #' "Name.ID" column. #' #' @param listSamples a \code{vector} of \code{character} string representing
dd76ad8e	#' the identifiers of the selected profiles. If \code{NULL}, all profiles are
be6ebeca	#' selected. Default: \code{NULL}. #' #' @return a \code{vector} of \code{character} string with the identifiers of
dd76ad8e	#' the profiles saved in the GDS file.
be6ebeca	#' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
4cf4bc51	#' ## Temporary GDS file in current directory
045e8817	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_10.gds")
be6ebeca	#'
045e8817	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
be6ebeca	#'
045e8817	#' ## Create "sample.annot" node (the node must be present) #' pedInformation <- data.frame(sample.id=c("sample_01", "sample_02"),
0fdab8b8	#' Name.ID=c("sample_01", "sample_02"), #' sex=c(1,1), # 1:Male 2: Female #' pop.group=c("ACB", "ACB"), #' superPop=c("AFR", "AFR"), #' batch=c(1, 1), #' stringsAsFactors=FALSE)
be6ebeca	#'
045e8817	#' ## The row names must be the sample identifiers #' rownames(pedInformation) <- pedInformation$Name.ID
be6ebeca	#'
045e8817	#' ## Add information about 2 samples to the GDS file #' RAIDS:::generateGDSRefSample(gdsReference=tmpGDS,
0fdab8b8	#' dfPedReference=pedInformation, listSamples=NULL)
be6ebeca	#'
045e8817	#' ## Read sample identifier list #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.id"))
be6ebeca	#'
045e8817	#' ## Read sample information from GDS file #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.annot"))
be6ebeca	#'
045e8817	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
0fdab8b8	#'
045e8817	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
0fdab8b8	#'
be6ebeca	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 #' @keywords internal
32e75790	generateGDSRefSample <- function(gdsReference, dfPedReference, listSamples=NULL) {
be6ebeca	if(!(is.null(listSamples))){
b6cc5ca4	dfPedReference <- dfPedReference[listSamples,]
be6ebeca	}
32e75790	add.gdsn(node=gdsReference, name="sample.id", val=dfPedReference[, "Name.ID"])
be6ebeca	## Create a data.frame containing the information form the samples
b6cc5ca4	samp.annot <- data.frame(sex=dfPedReference[, "sex"],
32e75790	pop.group=dfPedReference[, "pop.group"], superPop=dfPedReference[, "superPop"],
b6cc5ca4	batch=dfPedReference[, "batch"], stringsAsFactors=FALSE)
be6ebeca	## Add the data.frame to the GDS object
48837112	add.gdsn(node=gdsReference, name="sample.annot", val=samp.annot)
be6ebeca	## Return the vector of saved samples
b6cc5ca4	return(dfPedReference[, "Name.ID"])
be6ebeca	} #' @title Create a "sample.ref" node i a GDS file with the information about #' the related/unrelated state of the reference samples #' #' @description This function creates a "sample.ref" node in the GDS file. #' The node contains a vector of integers with value of 1 when #' the samples are used as references and 0 otherwise. #' The information used to fill the "sample.ref" node comes from the RDS file #' that contains the information about the unrelated reference samples. #'
aa3f0a99	#' @param gdsReference an object of class
be6ebeca	#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. #' #' @param filePart a \code{character} string representing the path and file #' name of a RDS file containing the information about the related and #' unrelated samples in the reference dataset. The RDS file must exist. The #' RDS file must contains a \code{vector} of \code{character} strings called #' "unrels" with the name of the unrelated samples. #' #' @return The integer \code{0L} when successful. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
3e69f2d1	#' ## Locate RDS with unrelated/related status for Reference samples
bdf3a5d8	#' dataDir <- system.file("extdata", package="RAIDS") #' rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds")
be6ebeca	#'
1434071b	#' ## Temporary GDS file
045e8817	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds")
0fdab8b8	#'
045e8817	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
be6ebeca	#
045e8817	#' ## Create "sample.id" node (the node must be present) #' sampleIDs <- c("HG00104", "HG00109", "HG00110") #' add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs)
0fdab8b8	#'
045e8817	#' ## Create "sample.ref" node in GDS file using RDS information #' RAIDS:::addGDSRef(gdsReference=tmpGDS, filePart=rdsFilePath)
be6ebeca	#'
045e8817	#' ## Read sample reference data.frame #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.ref"))
be6ebeca	#'
045e8817	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
be6ebeca	#'
045e8817	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
be6ebeca	#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn #' @encoding UTF-8 #' @keywords internal
aa3f0a99	addGDSRef <- function(gdsReference, filePart) {
be6ebeca	part <- readRDS(filePart)
aa3f0a99	sampleGDS <- index.gdsn(gdsReference, "sample.id")
be6ebeca	df <- data.frame(sample.id=read.gdsn(sampleGDS), sample.ref=0, stringsAsFactors=FALSE) # The order of part$unrels is not the same than df$sample.id df[df$sample.id %in% part$unrels, "sample.ref"] <- 1
aa3f0a99	add.gdsn(gdsReference, "sample.ref", df$sample.ref, storage="bit1")
be6ebeca	## Success return(0L) }
dd76ad8e	#' @title Appends the genotype information for specific samples #' (1 column == 1 profile) into a GDS file
be6ebeca	#' #' @description This function appends the genotype information into a #' GDS file. More specifically, the genotype information is added to the #' "genotype" node. The "genotype" node must already be present in the #' GDS file. The genotype information is a matrix with the rows corresponding #' to SNVs and columns corresponding to samples. #' The number of rows of the new genotype information must #' correspond to the number of rows of the matrix present in the #' "genotype" node. #' #' @param gds an object of class
dd76ad8e	#' \link[gdsfmt]{gds.class} (a GDS file), the opened Profile GDS file.
be6ebeca	#' #' @param matG a \code{matrix} of \code{integer} representing the genotypes #' of the SNVs for one or multiple samples. The rows correspond to SNVs and #' the columns correspond to samples. The number of rows must #' correspond to the number of rows of the matrix present in the #' "genotype" node. #' #' @return The integer \code{0L} when successful. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
1434071b	#' ## Create a temporary GDS file
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_06.gds")
be6ebeca	#'
5f024f9e	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
be6ebeca	#'
5f024f9e	#' ## Create a "genotype" node with initial matrix #' genoInitial <- matrix(rep(0L, 10), nrow=2)
be6ebeca	#'
5f024f9e	#' add.gdsn(node=tmpGDS, name="genotype", val=genoInitial) #' sync.gds(tmpGDS)
be6ebeca	#'
5f024f9e	#' ## New genotype information to be added #' newGenotype <- matrix(rep(1L, 6), nrow=2)
be6ebeca	#'
5f024f9e	#' ## Add segments to the GDS file #' RAIDS:::appendGDSgenotypeMat(gds=tmpGDS, matG=newGenotype)
be6ebeca	#'
5f024f9e	#' ## Read genotype information from GDS file #' ## The return matrix should be a combination of both initial matrix #' ## and new matrix (column binded) #' read.gdsn(index.gdsn(node=tmpGDS, path="genotype"))
be6ebeca	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
1434071b	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
1434071b	#'
be6ebeca	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn #' @importFrom utils read.csv2 #' @encoding UTF-8 #' @keywords internal appendGDSgenotypeMat <- function(gds, matG) { geno.var <- index.gdsn(gds, "genotype") append.gdsn(node=geno.var, val=matG, check=TRUE) return(0L) }
e8cb5540	#' @title Append the genotype information from a profile into the associated
9609f383	#' Profile GDS File
be6ebeca	#'
e8cb5540	#' @description This function append the genotype information from a specific #' profile into the Profile GDS file. The genotype information is extracted #' from a SNV file as generated by SNP-pileup or other tools.
be6ebeca	#'
87c14221	#' @param pathGeno a \code{character} string representing the path to the #' directory containing the VCF output of SNP-pileup for each sample. The #' SNP-pileup files must be compressed (gz files) and have the name identifiers #' of the samples. A sample with "Name.ID" identifier would have an #' associated file called #' if genoSource is "VCF", then "Name.ID.vcf.gz", #' if genoSource is "generic", then "Name.ID.generic.txt.gz" #' if genoSource is "snp-pileup", then "Name.ID.txt.gz".
be6ebeca	#' #' @param listSamples a \code{vector} of \code{character} string corresponding
dd76ad8e	#' to the sample identifiers that will have a Profile GDS file created. The
be6ebeca	#' sample identifiers must be present in the "Name.ID" column of the
b6cc5ca4	#' \code{data.frame} passed to the \code{dfPedProfile} parameter.
be6ebeca	#'
e8cb5540	#' @param listPos a \code{data.frame} containing 2 columns. The first column, #' called "snp.chromosome" contains the name of the chromosome where the #' SNV is located. The second column, called "snp.position" contains the #' position of the SNV on the chromosome.
be6ebeca	#'
e8cb5540	#' @param offset a \code{integer} to adjust if the genome start at 0 or 1.
be6ebeca	#'
e8cb5540	#' @param minCov a single positive \code{integer} representing the minimum #' coverage needed to keep the SNVs in the analysis. Default: \code{10}.
be6ebeca	#'
e8cb5540	#' @param minProb a single positive \code{numeric} between 0 and 1 #' representing the probability that the base change at the SNV position #' is not an error. #' Default: \code{0.999}.
be6ebeca	#' #' @param seqError a single positive \code{numeric} between 0 and 1 #' representing the sequencing error rate. Default: \code{0.001}. #'
32e75790	#' @param dfPedProfile a \code{data.frame} with the information about #' the sample(s).
be6ebeca	#' Those are mandatory columns: "Name.ID",
040f95da	#' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in #' \code{character} strings format. The \code{data.frame}
be6ebeca	#' must contain the information for all the samples passed in the #' \code{listSamples} parameter. #' #' @param batch a single positive \code{integer} representing the current #' identifier for the batch. Beware, this field is not stored anymore. #' #' @param studyDF a \code{data.frame} containing the information about the #' study associated to the analysed sample(s). The \code{data.frame} must have #' those 3 columns: "study.id", "study.desc", "study.platform". All columns #' must be in \code{character} strings. #'
e8cb5540	#' @param pathProfileGDS a \code{character} string representing the path to #' the directory where the GDS Sample files will be created.
ebecc683	#'
e8cb5540	#' @param genoSource a \code{character} string with two possible values:
9ed5a917	#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
e8cb5540	#' are generated by snp-pileup (Facets) or are a generic format CSV file #' with at least those columns: #' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. #' The 'Count' is the depth at the specified position; #' 'FileR' is the depth of the reference allele and #' 'File1A' is the depth of the specific alternative allele.
9ed5a917	#' Finally the file can be a VCF file with at least those genotype
279ad515	#' fields: GT, AD, DP.
ebecc683	#'
be6ebeca	#' @param verbose a \code{logical} indicating if the function must print #' messages when running. #' #' @return The function returns \code{0L} when successful. #' #' @examples #'
7b9d4130	#' ## Current directory
5f024f9e	#' dataDir <- file.path(tempdir())
7b9d4130	#'
5f024f9e	#' ## Copy required file into current directory #' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"),
7b9d4130	#' "ex1.txt.gz"), to=dataDir) #'
5f024f9e	#' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" #' ## The entries should be strings, not factors (stringsAsFactors=FALSE) #' studyDF <- data.frame(study.id = "MYDATA",
e8cb5540	#' study.desc = "Description", #' study.platform = "PLATFORM", #' stringsAsFactors = FALSE) #'
5f024f9e	#' ## The data.frame containing the information about the samples #' ## The entries should be strings, not factors (stringsAsFactors=FALSE) #' samplePED <- data.frame(Name.ID=c("ex1", "ex2"),
e8cb5540	#' Case.ID=c("Patient_h11", "Patient_h12"), #' Diagnosis=rep("Cancer", 2), #' Sample.Type=rep("Primary Tumor", 2), #' Source=rep("Databank B", 2), stringsAsFactors=FALSE)
5f024f9e	#' rownames(samplePED) <- samplePED$Name.ID
e8cb5540	#'
5f024f9e	#' ## List of SNV positions #' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)),
7b9d4130	#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, #' 3469737, 3471497, 3471565, 3471618))
e8cb5540	#'
5f024f9e	#' ## Append genotype information to the Profile GDS file #' result <- RAIDS:::generateGDS1KGgenotypeFromSNPPileup(pathGeno=dataDir,
e8cb5540	#' listSamples=c("ex1"), listPos=listPositions, #' offset=-1, minCov=10, minProb=0.999, seqError=0.001,
b6cc5ca4	#' dfPedProfile=samplePED, batch=1, studyDF=studyDF,
e8cb5540	#' pathProfileGDS=dataDir, genoSource="snp-pileup", #' verbose=FALSE) #'
5f024f9e	#' ## The function returns OL when successful #' result
7b9d4130	#'
5f024f9e	#' ## The Profile GDS file 'ex1.gds' has been created in the #' ## specified directory #' list.files(dataDir)
e8cb5540	#'
5f024f9e	#' ## Unlink Profile GDS file (created for demo purpose) #' unlink(file.path(dataDir, "ex1.gds")) #' unlink(file.path(dataDir, "ex1.txt.gz"))
e8cb5540	#'
be6ebeca	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds #' @importFrom stats qbinom #' @importFrom utils read.csv #' @encoding UTF-8 #' @keywords internal generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno, listSamples, listPos, offset, minCov=10, minProb=0.999,
b6cc5ca4	seqError=0.001, dfPedProfile, batch, studyDF, pathProfileGDS,
e8cb5540	genoSource, verbose) {
be6ebeca	# File with the description of the SNP keep
279ad515	if(genoSource == "VCF"){ listMat <- dir(pathGeno, pattern = ".+.vcf.gz") listSampleFile <- gsub(".vcf.gz", "", listMat) }else{ listMat <- dir(pathGeno, pattern = ".+.txt.gz") listSampleFile <- gsub(".txt.gz", "", listMat) }
ebecc683
be6ebeca	g <- as.matrix(rep(-1, nrow(listPos))) for(i in seq_len(length(listSamples))) { pos <- which(listSampleFile == listSamples[i]) if(verbose) { message(listSamples[i], " ", Sys.time(), " ", i) } if(length(pos) == 1) {
a597d143	#
ebecc683	if(genoSource == "snp-pileup") { matSample <- readSNVPileupFile(file.path(pathGeno, listMat[pos]), offset)
10c6487c	} else if(genoSource == "generic") {
ebecc683	matSample <- readSNVFileGeneric(file.path(pathGeno, listMat[pos]), offset)
279ad515	} else if(genoSource == "VCF") { tmpProfile <- gsub(".vcf.gz", "",listMat[pos]) matSample <- readSNVVCF(file.path(pathGeno, listMat[pos]), profileName=tmpProfile, offset)
a597d143	}
be6ebeca	# matAll <- merge(matSample[,c( "Chromosome", "Position", # "File1R", "File1A", # "count" )], # listPos, # by.x = c("Chromosome", "Position"), # by.y = c("snp.chromosome", "snp.position"), # all.y = TRUE, # all.x = FALSE) # # below same as the merge above but faster z <- cbind(c(listPos$snp.chromosome, matSample$Chromosome, matSample$Chromosome), c(listPos$snp.position, matSample$Position, matSample$Position), c(rep(1,nrow(listPos)), rep(0,nrow(matSample)), rep(2,nrow(matSample))), c(rep(0,nrow(listPos)), matSample[, "File1R"], -1 * matSample[, "File1R"]), c(rep(0,nrow(listPos)), matSample[, "File1A"], -1 * matSample[, "File1A"]), c(rep(0,nrow(listPos)), matSample[, "count"], -1 * matSample[, "count"])) rm(matSample) z <- z[order(z[,1], z[,2], z[,3]),] matAll <- data.frame(Chromosome=z[z[, 3] == 1, 1], Position=z[z[, 3] == 1, 2], File1R=cumsum(z[, 4])[z[, 3] == 1], File1A=cumsum(z[,5])[z[, 3] == 1], count=cumsum(z[, 6])[z[, 3] == 1]) rm(z)
e8cb5540	if(is.null(pathProfileGDS)){ stop("pathProfileGDS is NULL in ",
be6ebeca	"generateGDS1KGgenotypeFromSNPPileup\n") } else{
e8cb5540	if(! dir.exists(pathProfileGDS)) { dir.create(pathProfileGDS)
be6ebeca	} }
e8cb5540	fileGDSSample <- file.path(pathProfileGDS,
be6ebeca	paste0(listSamples[i], ".gds")) if(file.exists(fileGDSSample)) { gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE) } else{ gdsSample <- createfn.gds(fileGDSSample) } if (! "Ref.count" %in% ls.gdsn(gdsSample)) { var.Ref <- add.gdsn(gdsSample, "Ref.count", matAll$File1R, valdim=c( nrow(listPos), 1), storage="sp.int16") var.Alt <- add.gdsn(gdsSample, "Alt.count", matAll$File1A, valdim=c( nrow(listPos), 1), storage="sp.int16") var.Count <- add.gdsn(gdsSample, "Total.count", matAll$count, valdim=c( nrow(listPos), 1), storage="sp.int16") } else { # you must append var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"), matAll$File1R) var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"), matAll$File1A) var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"), matAll$count) }
3104ba2c	listSampleGDS <- addStudyGDSSample(gdsSample, pedProfile=dfPedProfile, batch=batch, listSamples=c(listSamples[i]), studyDF=studyDF, verbose=verbose)
be6ebeca	listCount <- table(matAll$count[matAll$count >= minCov]) cutOffA <- data.frame(count=unlist(vapply(as.integer(names(listCount)), FUN=function(x, minProb, eProb){
e8cb5540	return(max(2,qbinom(minProb, x, eProb))) },
be6ebeca	FUN.VALUE=numeric(1), minProb=minProb, eProb=2 * seqError)), allele=unlist(vapply(as.integer(names(listCount)), FUN=function(x, minProb, eProb){
e8cb5540	return(max(2,qbinom(minProb, x, eProb))) },
be6ebeca	FUN.VALUE=numeric(1), minProb=minProb, eProb=seqError))) row.names(cutOffA) <- names(listCount) # Initialize the genotype array at -1 # Select the position where the coverage of the 2 alleles is enough listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov) matAllC <- matAll[listCov,] # The difference depth - (nb Ref + nb Alt) can be realistically # explain by sequencing error listCov <- listCov[(matAllC$count - (matAllC$File1R + matAllC$File1A)) < cutOffA[as.character(matAllC$count), "count"]] matAllC <- matAll[listCov,] rm(matAll) g <- as.matrix(rep(-1, nrow(listPos))) # The sample is homozygote if the other known allele have a # coverage of 0 g[listCov][which(matAllC$File1A == 0)] <- 0 g[listCov][which(matAllC$File1R == 0)] <- 2 # The sample is heterozygote if explain the coverage of # the minor allele by sequencing error is not realistic. g[listCov][which(matAllC$File1A >= cutOffA[as.character(matAllC$count), "allele"] & matAllC$File1R >= cutOffA[as.character(matAllC$count), "allele"])] <- 1 #g <- as.matrix(g) if("geno.ref" %in% ls.gdsn(gdsSample)){ var.geno <- index.gdsn(gdsSample, "geno.ref") compression.gdsn(var.geno, compress="")
e8cb5540	append.gdsn(var.geno, g, check=TRUE)
be6ebeca	compression.gdsn(var.geno, compress="LZMA_RA.fast") readmode.gdsn(var.geno) }else{ var.geno <- add.gdsn(gdsSample, "geno.ref", valdim=c(length(g), 1), g, storage="bit2", compress = "LZMA_RA.fast") readmode.gdsn(var.geno) } rm(g) closefn.gds(gdsfile=gdsSample) if (verbose) { message(listMat[pos], " ", i, " ", Sys.time()) } } else { stop("Missing samples ", listSamples[i]) } } ## Success return(0L) }
dd2ee042	#' @title Append the genotype information from a profile into the associated #' Profile GDS File #' #' @description This function append the genotype information from a specific #' profile into the Profile GDS file. The genotype information is extracted #' from a SNV file as generated by SNP-pileup or other tools. #' #' @param profileFile a \code{character} string representing the path and the #' file name of the genotype file or the bam if genoSource is snp-pileup the #' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz #' #' @param profileName a \code{character} string representing the profileName #' #' @param listPos a \code{data.frame} containing 2 columns. The first column, #' called "snp.chromosome" contains the name of the chromosome where the #' SNV is located. The second column, called "snp.position" contains the #' position of the SNV on the chromosome. #' #' @param offset a \code{integer} to adjust if the genome start at 0 or 1. #' #' @param minCov a single positive \code{integer} representing the minimum #' coverage needed to keep the SNVs in the analysis. Default: \code{10}. #' #' @param minProb a single positive \code{numeric} between 0 and 1 #' representing the probability that the base change at the SNV position #' is not an error. #' Default: \code{0.999}. #' #' @param seqError a single positive \code{numeric} between 0 and 1 #' representing the sequencing error rate. Default: \code{0.001}. #' #' @param dfPedProfile a \code{data.frame} with the information about #' the sample(s). #' Those are mandatory columns: "Name.ID", #' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in #' \code{character} strings format. The \code{data.frame} #' must contain the information for all the samples passed in the #' \code{listSamples} parameter. #' #' @param batch a single positive \code{integer} representing the current #' identifier for the batch. Beware, this field is not stored anymore. #' #' @param studyDF a \code{data.frame} containing the information about the #' study associated to the analysed sample(s). The \code{data.frame} must have #' those 3 columns: "study.id", "study.desc", "study.platform". All columns #' must be in \code{character} strings. #' #' @param pathProfileGDS a \code{character} string representing the path to #' the directory where the GDS Sample files will be created. #' #' @param genoSource a \code{character} string with two possible values: #' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files #' are generated by snp-pileup (Facets) or are a generic format CSV file #' with at least those columns: #' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'. #' The 'Count' is the depth at the specified position; #' 'FileR' is the depth of the reference allele and #' 'File1A' is the depth of the specific alternative allele. #' Finally the file can be a VCF file with at least those genotype #' fields: GT, AD, DP. #'
9969cfc7	#' @param paramProfileGDS a \code{list} parameters ... #'
dd2ee042	#' @param verbose a \code{logical} indicating if the function must print #' messages when running. #' #' @return The function returns \code{0L} when successful. #' #' @examples #' #' ## Current directory #' dataDir <- file.path(tempdir()) #' #' ## Copy required file into current directory #' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"), #' "ex1.txt.gz"), to=dataDir) #' #' ## The data.frame containing the information about the study #' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform" #' ## The entries should be strings, not factors (stringsAsFactors=FALSE) #' studyDF <- data.frame(study.id = "MYDATA", #' study.desc = "Description", #' study.platform = "PLATFORM", #' stringsAsFactors = FALSE) #' #' ## The data.frame containing the information about the samples #' ## The entries should be strings, not factors (stringsAsFactors=FALSE) #' samplePED <- data.frame(Name.ID=c("ex1", "ex2"), #' Case.ID=c("Patient_h11", "Patient_h12"), #' Diagnosis=rep("Cancer", 2), #' Sample.Type=rep("Primary Tumor", 2), #' Source=rep("Databank B", 2), stringsAsFactors=FALSE) #' rownames(samplePED) <- samplePED$Name.ID #' #' ## List of SNV positions #' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)), #' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527, #' 3469737, 3471497, 3471565, 3471618)) #' #' ## Append genotype information to the Profile GDS file #' result <- RAIDS:::generateProfileGDS(profileFile=file.path(dataDir, "ex1.txt.gz"), #' profileName="ex1", listPos=listPositions, #' offset=-1, minCov=10, minProb=0.999, seqError=0.001, #' dfPedProfile=samplePED, batch=1, studyDF=studyDF, #' pathProfileGDS=dataDir, genoSource="snp-pileup", #' verbose=FALSE) #' #' ## The function returns OL when successful #' result #' #' ## The Profile GDS file 'ex1.gds' has been created in the #' ## specified directory #' list.files(dataDir) #' #' ## Unlink Profile GDS file (created for demo purpose) #' unlink(file.path(dataDir, "ex1.gds")) #' unlink(file.path(dataDir, "ex1.txt.gz")) #' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds #' @importFrom stats qbinom #' @importFrom utils read.csv #' @encoding UTF-8 #' @keywords internal generateProfileGDS <- function(profileFile, profileName, listPos, offset, minCov=10, minProb=0.999, seqError=0.001, dfPedProfile, batch, studyDF, pathProfileGDS,
9969cfc7	genoSource, paramProfileGDS, verbose) {
dd2ee042	# File with the description of the SNP keep # if(genoSource == "VCF"){ # listMat <- dir(pathGeno, pattern = ".+.vcf.gz") # listSampleFile <- gsub(".vcf.gz", "", listMat) # }else{ # listMat <- dir(pathGeno, pattern = ".+.txt.gz") # listSampleFile <- gsub(".txt.gz", "", listMat) # }
2eb8e602
dd2ee042	# for(i in seq_len(length(listSamples))) { #pos <- which(listSampleFile == listSamples[i]) if(verbose) { message("generateProfileGDS start ", " ", Sys.time()) } # if(length(pos) == 1) { # if(genoSource == "snp-pileup") { matSample <- readSNVPileupFile(profileFile, offset) } else if(genoSource == "generic") { matSample <- readSNVFileGeneric(profileFile, offset) } else if(genoSource == "VCF") { # tmpProfile <- gsub(".vcf.gz", "",listMat[pos]) matSample <- readSNVVCF(profileFile, profileName=profileName, offset) } else if(genoSource == "bam"){
2eb8e602
9969cfc7	matSample <- readSNVBAM(fileName=profileFile, varSelected=listPos, paramSNVBAM=paramProfileGDS,
2eb8e602	offset,
9969cfc7	verbose=verbose)
3ab308bd	# listPos <- do.call(rbind, listPos)
9969cfc7	colnames(listPos)[1:2] <- c("snp.chromosome", "snp.position")
2eb8e602
dd2ee042	} # matAll <- merge(matSample[,c( "Chromosome", "Position", # "File1R", "File1A", # "count" )], # listPos, # by.x = c("Chromosome", "Position"), # by.y = c("snp.chromosome", "snp.position"), # all.y = TRUE, # all.x = FALSE) # # below same as the merge above but faster
2eb8e602	if(verbose) {message("End read ", Sys.time())} g <- as.matrix(rep(-1, nrow(listPos)))
dd2ee042	z <- cbind(c(listPos$snp.chromosome, matSample$Chromosome, matSample$Chromosome), c(listPos$snp.position, matSample$Position, matSample$Position), c(rep(1,nrow(listPos)), rep(0,nrow(matSample)), rep(2,nrow(matSample))), c(rep(0,nrow(listPos)), matSample[, "File1R"], -1 * matSample[, "File1R"]), c(rep(0,nrow(listPos)), matSample[, "File1A"], -1 * matSample[, "File1A"]), c(rep(0,nrow(listPos)), matSample[, "count"], -1 * matSample[, "count"])) rm(matSample) z <- z[order(z[,1], z[,2], z[,3]),] matAll <- data.frame(Chromosome=z[z[, 3] == 1, 1], Position=z[z[, 3] == 1, 2], File1R=cumsum(z[, 4])[z[, 3] == 1], File1A=cumsum(z[,5])[z[, 3] == 1], count=cumsum(z[, 6])[z[, 3] == 1]) rm(z) if(is.null(pathProfileGDS)){ stop("pathProfileGDS is NULL in ", "generateGDS1KGgenotypeFromSNPPileup\n") } else{ if(! dir.exists(pathProfileGDS)) { dir.create(pathProfileGDS) } } fileGDSSample <- file.path(pathProfileGDS, paste0(profileName, ".gds")) if(file.exists(fileGDSSample)) { gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE) } else{ gdsSample <- createfn.gds(fileGDSSample) } if (! "Ref.count" %in% ls.gdsn(gdsSample)) { var.Ref <- add.gdsn(gdsSample, "Ref.count", matAll$File1R, valdim=c( nrow(listPos), 1), storage="sp.int16") var.Alt <- add.gdsn(gdsSample, "Alt.count", matAll$File1A, valdim=c( nrow(listPos), 1), storage="sp.int16") var.Count <- add.gdsn(gdsSample, "Total.count", matAll$count, valdim=c( nrow(listPos), 1), storage="sp.int16") } else { # you must append var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"), matAll$File1R) var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"), matAll$File1A) var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"), matAll$count) } listSampleGDS <- addStudyGDSSample(gdsSample, pedProfile=dfPedProfile, batch=batch, listSamples=c(profileName), studyDF=studyDF, verbose=verbose) listCount <- table(matAll$count[matAll$count >= minCov]) cutOffA <- data.frame(count=unlist(vapply(as.integer(names(listCount)), FUN=function(x, minProb, eProb){ return(max(2,qbinom(minProb, x, eProb))) }, FUN.VALUE=numeric(1), minProb=minProb, eProb=2 * seqError)), allele=unlist(vapply(as.integer(names(listCount)), FUN=function(x, minProb, eProb){ return(max(2,qbinom(minProb, x, eProb))) }, FUN.VALUE=numeric(1), minProb=minProb, eProb=seqError))) row.names(cutOffA) <- names(listCount) # Initialize the genotype array at -1 # Select the position where the coverage of the 2 alleles is enough listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov) matAllC <- matAll[listCov,] # The difference depth - (nb Ref + nb Alt) can be realistically # explain by sequencing error listCov <- listCov[(matAllC$count - (matAllC$File1R + matAllC$File1A)) < cutOffA[as.character(matAllC$count), "count"]] matAllC <- matAll[listCov,] rm(matAll) g <- as.matrix(rep(-1, nrow(listPos))) # The sample is homozygote if the other known allele have a # coverage of 0 g[listCov][which(matAllC$File1A == 0)] <- 0 g[listCov][which(matAllC$File1R == 0)] <- 2 # The sample is heterozygote if explain the coverage of # the minor allele by sequencing error is not realistic. g[listCov][which(matAllC$File1A >= cutOffA[as.character(matAllC$count), "allele"] & matAllC$File1R >= cutOffA[as.character(matAllC$count), "allele"])] <- 1 #g <- as.matrix(g) if("geno.ref" %in% ls.gdsn(gdsSample)){ var.geno <- index.gdsn(gdsSample, "geno.ref") compression.gdsn(var.geno, compress="") append.gdsn(var.geno, g, check=TRUE) compression.gdsn(var.geno, compress="LZMA_RA.fast") readmode.gdsn(var.geno) }else{ var.geno <- add.gdsn(gdsSample, "geno.ref", valdim=c(length(g), 1), g, storage="bit2", compress = "LZMA_RA.fast") readmode.gdsn(var.geno) } rm(g) closefn.gds(gdsfile=gdsSample) if (verbose) { message("End ", profileName, " ", Sys.time()) } ## Success return(0L) }
be6ebeca
a3afa570	#' @title Add information related to a specific study and specific samples #' into a GDS Sample file #' #' @description This function add entries related to 1) a specific study and #' 2) specific samples into a GDS Sample file. #' The study information is appended to the GDS Sample file "study.list" node #' when the node is already present in the file. Otherwise, the node is #' created and then, the information is added. #' The sample information for all selected samples is appended to the GDS #' Sample file "study.annot" node #' when the node is already present in the file. Otherwise, the node is #' created and then, the information is added. #'
aa3f0a99	#' @param gdsProfile an object of class
a3afa570	#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. #'
3104ba2c	#' @param pedProfile a \code{data.frame} with the sample information. The
a3afa570	#' \code{data.frame} must have the columns: #' "Name.ID", "Case.ID", "Sample.Type", "Diagnosis" and "Source". #' The unique sample identifier of the \code{data.frame} is the "Name.ID" #' column and the row names of the \code{data.frame} must be the "Name.ID" #' values. #' #' @param batch a \code{integer} corresponding the batch associated to the #' study. #' #' @param listSamples a \code{vector} of \code{character} string representing #' the samples (samples identifiers) that are saved into the GDS. All the #' samples must be present in the 'pdeDF' \code{data.frame}.
b6cc5ca4	#' If \code{NULL}, all samples present in the \code{dfPedProfile} are used.
a3afa570	#' #' @param studyDF a \code{data.frame} with at least the 3 columns: "study.id", #' "study.desc" and "study.platform". The three columns are in character #' string format (no factor). #' #' @param verbose a \code{logical} indicating if messages should be printed #' to show how the different steps in the function. #' #' @return a \code{vector} of \code{character} strings representing the sample #' identifiers that have been saved in the GDS Sample file. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
1434071b	#' ## Create a temporary GDS file in an current directory
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds")
a3afa570	#'
5f024f9e	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
a3afa570	#'
5f024f9e	#' ## Create a PED data frame with sample information #' ped1KG <- data.frame(Name.ID=c("1KG_sample_01", "1KG_sample_02"),
7b9d4130	#' Case.ID=c("1KG_sample_01", "1KG_sample_02"), #' Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2), #' Source=rep("IGSR", 2), stringsAsFactors=FALSE)
a3afa570	#'
5f024f9e	#' ## Create a Study data frame with information about the study #' ## All samples are associated to the same study #' studyInfo <- data.frame(study.id="Ref.1KG",
7b9d4130	#' study.desc="Unrelated samples from 1000 Genomes", #' study.platform="GRCh38 1000 genotypes", #' stringsAsFactors=FALSE)
a3afa570	#'
5f024f9e	#' ## Add the sample information to the GDS Sample file #' ## The information for all samples is added (listSamples=NULL) #' RAIDS:::addStudyGDSSample(gdsProfile=tmpGDS, pedProfile=ped1KG, batch=1,
7b9d4130	#' listSamples=NULL, studyDF=studyInfo, verbose=FALSE)
a3afa570	#'
5f024f9e	#' ## Read study information from GDS Sample file #' read.gdsn(index.gdsn(node=tmpGDS, path="study.list"))
a3afa570	#'
5f024f9e	#' ## Read sample information from GDS Sample file #' read.gdsn(index.gdsn(node=tmpGDS, path="study.annot"))
7b9d4130	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
7b9d4130	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
7b9d4130	#'
a3afa570	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn append.gdsn #' @encoding UTF-8 #' @keywords internal
3104ba2c	addStudyGDSSample <- function(gdsProfile, pedProfile, batch, listSamples,
16faec6d	studyDF, verbose) {
a3afa570	## Used only the selected samples (all when listSamples == NULL) if(!(is.null(listSamples))) { if(length(listSamples) == length(intersect(listSamples,
d135565e	pedProfile$Name.ID))) {
79712a42	# if we remove the names we should manage the listSamples order
d135565e	# something like
79712a42	tmp <- order(as.character(listSamples)) pedProfile <- pedProfile[which(pedProfile$Name.ID %in% listSamples), ] pedProfile <- pedProfile[order(pedProfile$Name.ID), ][order(tmp),]
a3afa570	} else { stop("List of samples includes samples not present in ",
3104ba2c	"the \'pedProfile\' data frame. The sample names must be ",
16faec6d	"present in the \'Name.ID\' column. The row names should", " be assigned the \'Name.ID\'.")
a3afa570	} } else {
3104ba2c	listSamples <- pedProfile$Name.ID
a3afa570	} ## Create the study data frame that is going to be saved
b3a84582	df <- data.frame(study.id=studyDF$study.id, study.desc=studyDF$study.desc, study.platform=studyDF$study.platform, stringsAsFactors=FALSE)
a3afa570	## Append study information to "study.list" when node already present ## Otherwise, create node and add study information into it
aa3f0a99	if(! "study.list" %in% ls.gdsn(gdsProfile)) {
a3afa570	## Create study node and add study information into GDS Sample file
aa3f0a99	add.gdsn(gdsProfile, "study.list", df)
a3afa570
b3a84582	## Create data frame containing sample information and add it to GDS
3104ba2c	study.annot <- data.frame(data.id=pedProfile[, "Name.ID"], case.id=pedProfile[, "Case.ID"], sample.type=pedProfile[, "Sample.Type"], diagnosis=pedProfile[, "Diagnosis"], source=pedProfile[, "Source"], study.id=rep(studyDF$study.id, nrow(pedProfile)),
16faec6d	stringsAsFactors=FALSE)
aa3f0a99	add.gdsn(gdsProfile, "study.annot", study.annot)
a3afa570	if(verbose) { message("study.annot DONE ", Sys.time()) } } else{ ## Append study information to existing node
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.list/study.id"),
a3afa570	df$study.id, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.list/study.desc"),
a3afa570	df$study.desc, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.list/study.platform"),
a3afa570	df$study.platform, check=TRUE) ## Create data frame containing sample information
3104ba2c	study.annot <- data.frame(data.id=pedProfile[, "Name.ID"], case.id=pedProfile[, "Case.ID"], sample.type=pedProfile[, "Sample.Type"], diagnosis=pedProfile[, "Diagnosis"], source=pedProfile[, "Source"], study.id=rep(studyDF$study.id, nrow(pedProfile)),
16faec6d	stringsAsFactors=FALSE)
a3afa570	## Append sample information to existing node
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/data.id"),
a3afa570	study.annot$data.id, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/case.id"),
a3afa570	study.annot$case.id, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/sample.type"),
a3afa570	study.annot$sample.type, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/diagnosis"),
a3afa570	study.annot$diagnosis, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/source"),
a3afa570	study.annot$source, check=TRUE)
aa3f0a99	append.gdsn(index.gdsn(gdsProfile, "study.annot/study.id"),
a3afa570	study.annot$study.id, check=TRUE) if(verbose) { message("study.annot DONE ", Sys.time()) } }
b3a84582	## Return the vector of profile IDs that have been added to Profile GDS file
3104ba2c	return(pedProfile[,"Name.ID"])
a3afa570	}
be6ebeca	#' @title Identity-by-descent (IBD) analysis #' #' @description This function calculates the IDB coefficients by KING method #' of moment using the #' \code{\link[SNPRelate:snpgdsIBDKING]{SNPRelate::snpgdsIBDKING}} #' function. #' #' @param gds an object of class #' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an #' opened SNP GDS file. #' #' @param profileID a \code{vector} of \code{character} strings representing #' the samples to keep for the analysis. If \code{NULL}, all samples are used. #' Default: \code{NULL}. #' #' @param snpID a \code{vector} of \code{character} strings representing #' the SNPs to keep for the analysis. If \code{NULL}, all SNPs are used. #' Default: \code{NULL}. #' #' @param maf a single \code{numeric} representing the threshold for the minor #' allele frequency. Only the SNPs with ">= maf" are retained. #' Default: \code{0.05}. #' #' @param verbose a \code{logical} indicating if information is shown during #' the process in the \code{\link[SNPRelate]{snpgdsIBDKING}}() function. #' #' @return a \code{list} containing:
38d46f1e	#' \describe{
be6ebeca	#' \item{sample.id}{a \code{character} string representing the sample #' ids used in the analysis} #' \item{snp.id}{a \code{character} string representing the SNP ids #' used in the analysis} #' \item{k0}{a \code{numeric}, the IBD coefficient, the probability of #' sharing zero IBD} #' \item{k1}{a \code{numeric}, the IBD coefficient, the probability of #' sharing one IBD} #' \item{IBS0}{a \code{numeric}, the proportion of SNPs with zero IBS} #' \item{kinship}{a \code{numeric}, the proportion of SNPs with zero IBS, #' if the parameter kinship=TRUE} #' } #' #' @examples #' #' ## Required #' library(SNPRelate) #' #' ## Open an example dataset (HapMap) #' genoFile <- snpgdsOpen(snpgdsExampleFileName()) #' #' ## Extract CEU population #' samples <- read.gdsn(index.gdsn(genoFile, "sample.id")) #' CEU <- samples[ #' read.gdsn(index.gdsn(genoFile, "sample.annot/pop.group"))=="CEU"] #' #' ## Infer the presence of population stratification #' ibd.robust <- RAIDS:::runIBDKING(gds=genoFile, profileID=CEU, snpID=NULL, #' maf=0.05, verbose=FALSE) #' #' ## close the genotype file #' snpgdsClose(genoFile) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom SNPRelate snpgdsIBDKING #' @encoding UTF-8 #' @keywords internal runIBDKING <- function(gds, profileID=NULL, snpID=NULL, maf=0.05, verbose) { ## Calculate IBD coefficients by KING method of moment ibd.robust <- snpgdsIBDKING(gdsobj=gds, sample.id=profileID,
d899f5f8	snp.id=snpID, maf=maf, type="KING-robust", verbose=verbose) ## Return result
be6ebeca	return(ibd.robust) } #' @title SNP pruning based on linkage disequilibrium (LD) #' #' @description This function is a wrapper for the #' \code{\link[SNPRelate]{snpgdsLDpruning}}() function that generates a pruned #' subset of SNPs that are in approximate linkage equilibrium. #' #' @param gds an \code{object} of class #' \code{\link[SNPRelate]{SNPGDSFileClass}}, a SNP GDS file. #' #' @param method a \code{character} string that represents the method that will #' be used to calculate the linkage disequilibrium in the #' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. The 4 possible values
b3a84582	#' are: "corr", "r", "dprime" and "composite".
be6ebeca	#' #' @param listSamples a \code{vector} of \code{character} strings #' corresponding to the sample identifiers used in LD pruning done by the #' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. If \code{NULL}, all #' samples are used. Default: \code{NULL}. #' #' @param listKeep a \code{vector} of SNVs identifiers specifying selected; #' if \code{NULL}, all SNVs are used in the #' \code{\link[SNPRelate]{snpgdsLDpruning}} function. Default: \code{NULL}. #' #' @param slideWindowMaxBP a single positive \code{integer} that represents #' the maximum basepairs (bp) in the sliding window. This parameter is used #' for the LD pruning done in the \code{\link[SNPRelate]{snpgdsLDpruning}}() #' function. #' Default: \code{500000L}. #' #' @param thresholdLD a single \code{numeric} value that represents the LD #' threshold used in the \code{\link[SNPRelate]{snpgdsLDpruning}} function. #' Default: \code{sqrt(0.1)}. #' #' @param np a single positive \code{integer} specifying the number of #' threads to be used. Default: \code{1L}. #' #' @param verbose a \code{logical} indicating if information is shown during #' the process in the \code{\link[SNPRelate]{snpgdsLDpruning}}() function. #' #' @return a \code{list} of SNP identifiers stratified by chromosomes as #' generated by \code{\link[SNPRelate]{snpgdsLDpruning}} function. #' #' @details #' #' The SNP pruning is based on linkage disequilibrium (LD) and is done by the #' \code{\link[SNPRelate]{snpgdsLDpruning}}() function in the #' SNPRelate package (https://bioconductor.org/packages/SNPRelate/). #' #' @examples #'
1434071b	#' ## Required #' library(SNPRelate) #'
be6ebeca	#' ## Open an example dataset (HapMap) #' genoFile <- snpgdsOpen(snpgdsExampleFileName()) #' #' ## Fix seed to get reproducible results #' set.seed(1000) #' #' ## Get linkage Disequilibrium (LD) based SNP pruning #' snpSet <- RAIDS:::runLDPruning(gds=genoFile, verbose=FALSE) #' names(snpSet) #' #' ## Get SNP ids #' snp.id <- unlist(unname(snpSet)) #' #' ## Close the genotype file #' snpgdsClose(genoFile) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' #' @importFrom SNPRelate snpgdsOpen snpgdsLDpruning #' @importFrom gdsfmt closefn.gds #' @encoding UTF-8 #' @keywords internal
b3a84582	runLDPruning <- function(gds, method,
be6ebeca	listSamples=NULL, listKeep=NULL, slideWindowMaxBP=500000L, thresholdLD=sqrt(0.1), np=1L, verbose) { ## Call SNP LD pruning
d899f5f8	result <- snpgdsLDpruning(gdsobj=gds, method="corr", sample.id=listSamples, snp.id=listKeep, slide.max.bp=slideWindowMaxBP, ld.threshold=thresholdLD, num.thread=np, verbose=verbose)
be6ebeca	return(result) }
3978de30	#' @title Append fields related to samples into a GDS file #' #' @description This function appends the fields related to samples into #' a GDS file. The information is extracted from the \code{data.frame} passed #' to the function and is added to the "sample.annot" and "sample.id" nodes. #' The "sample.id" and "sample.annot" nodes must already exist. #' If the samples are part of a study, the function #' addStudyGDSSample() must be used. #'
aa3f0a99	#' @param gdsReference an object of class
3978de30	#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. #'
32e75790	#' @param dfPedReference a \code{data.frame} with the information about #' the sample(s).
3978de30	#' The \code{data.frame} must have the columns: "sample.id", "Name.ID", "sex", #' "pop.group" and "superPop". The unique identifier for the sample(s) is #' the "Name.ID" column and the row names of the \code{data.frame} must #' correspond to the "Name.ID" column. #' #' @param batch a \code{integer} representing the batch identifier. #' #' @param listSamples a \code{vector} of \code{character} string with the #' selected sample(s). If \code{NULL}, all samples are used. #' #' @param verbose a \code{logical} indicating if messages should be printed #' to show how the different steps in the function. Default: \code{TRUE}. #' #' @return The integer \code{0L} when successful. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
3978de30	#' ## Create a temporary GDS file in an test directory
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_03.gds")
3978de30	#'
5f024f9e	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
7b9d4130	#'
5f024f9e	#' ## Create "sample.id" node (the node must be present) #' add.gdsn(node=tmpGDS, name="sample.id", val=c("sample_01",
7b9d4130	#' "sample_02")) #'
5f024f9e	#' ## Create "sample.annot" node (the node must be present) #' add.gdsn(node=tmpGDS, name="sample.annot", val=data.frame(
7b9d4130	#' Name.ID=c("sample_01", "sample_02"), #' sex=c(1,1), # 1:Male 2: Female #' pop.group=c("ACB", "ACB"), #' superPop=c("AFR", "AFR"), #' batch=c(1, 1), #' stringsAsFactors=FALSE)) #'
5f024f9e	#' sync.gds(gdsfile=tmpGDS)
7b9d4130	#'
5f024f9e	#' ## Create a data.frame with information about samples #' sample_info <- data.frame(Name.ID=c("sample_04", "sample_05",
7b9d4130	#' "sample_06"), #' sex=c(1,2,1), # 1:Male 2: Female #' pop.group=c("ACB", "ACB", "ACB"), #' superPop=c("AFR", "AFR", "AFR"), #' stringsAsFactors=FALSE) #'
5f024f9e	#' ## The row names must be the sample identifiers #' rownames(sample_info) <- sample_info$Name.ID
7b9d4130	#'
5f024f9e	#' ## Add information about 2 samples to the GDS file #' RAIDS:::appendGDSRefSample(gdsReference=tmpGDS,
7b9d4130	#' dfPedReference=sample_info, #' batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE) #'
5f024f9e	#' ## Read sample identifier list #' ## Only "sample_04" and "sample_06" should have been added #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.id"))
7b9d4130	#'
5f024f9e	#' ## Read sample information from GDS file #' ## Only "sample_04" and "sample_06" should have been added #' read.gdsn(index.gdsn(node=tmpGDS, path="sample.annot"))
3978de30	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
3978de30	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
3978de30	#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn append.gdsn #' @encoding UTF-8 #' @keywords internal
32e75790	appendGDSRefSample <- function(gdsReference, dfPedReference, batch=1, listSamples=NULL, verbose=TRUE) {
3978de30	## Only keep selected samples if(!(is.null(listSamples))){
b6cc5ca4	dfPedReference <- dfPedReference[listSamples,]
3978de30	} ## Append sample identifiers to the "sample.id" node
aa3f0a99	sampleGDS <- index.gdsn(gdsReference, "sample.id")
b6cc5ca4	append.gdsn(sampleGDS, val=dfPedReference$Name.ID, check=TRUE)
3978de30	## Create the data.frame with the sample information
b6cc5ca4	samp.annot <- data.frame(sex = dfPedReference[, "sex"], pop.group=dfPedReference[, "pop.group"], superPop=dfPedReference[, "superPop"], batch=rep(batch, nrow(dfPedReference)),
3978de30	stringsAsFactors=FALSE) if(verbose) { message("Annot") } ## Append data.frame to "sample.annot" node
aa3f0a99	curAnnot <- index.gdsn(gdsReference, "sample.annot/sex")
3978de30	append.gdsn(curAnnot, samp.annot$sex, check=TRUE)
aa3f0a99	curAnnot <- index.gdsn(gdsReference, "sample.annot/pop.group")
3978de30	append.gdsn(curAnnot, samp.annot$pop.group, check=TRUE)
aa3f0a99	curAnnot <- index.gdsn(gdsReference, "sample.annot/superPop")
3978de30	append.gdsn(curAnnot, samp.annot$superPop, check=TRUE)
aa3f0a99	curAnnot <- index.gdsn(gdsReference, "sample.annot/batch")
3978de30	append.gdsn(curAnnot, samp.annot$batch, check=TRUE) if(verbose) { message("Annot done") } return(0L) }
b3a84582	#' @title Add the pruned.study entry related to the SNV dataset in the #' Profile GDS file #' #' @description This function adds the names of the SNVs into the node called #' "pruned.study" in GDS #' Sample file. If a "pruned.study" entry is already present, the entry is #' deleted and a new entry is created. #'
32e75790	#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS file), #' the opened Profile GDS file.
b3a84582	#' #' @param pruned a \code{vector} of \code{character} string representing the #' name of the SNVs. #' #' @return The integer \code{0L} when successful. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #' #' ## Create a temporary GDS file in an test directory
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_1.gds")
b3a84582	#'
5f024f9e	#' ## Create and open the GDS file #' tmpGDS <- createfn.gds(filename=gdsFilePath)
b3a84582	#'
5f024f9e	#' ## Vector of low allelic fraction #' study <- c("s19222", 's19588', 's19988', 's20588', 's23598')
b3a84582	#'
5f024f9e	#' ## Add segments to the GDS file #' RAIDS:::addGDSStudyPruning(gdsProfile=tmpGDS, pruned=study)
b3a84582	#'
5f024f9e	#' ## Read lap information from GDS file #' read.gdsn(index.gdsn(node=tmpGDS, path="pruned.study"))
b3a84582	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=tmpGDS)
851a4988	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
851a4988	#'
b3a84582	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 #' @keywords internal
aa3f0a99	addGDSStudyPruning <- function(gdsProfile, pruned) {
b3a84582	## Delete the pruned.study entry if present in the Profile GDS file
aa3f0a99	if("pruned.study" %in% ls.gdsn(gdsProfile)) { delete.gdsn(index.gdsn(node=gdsProfile, "pruned.study"))
b3a84582	} ## Create the pruned.study node in the Profile GDS file
aa3f0a99	varPruned <- add.gdsn(node=gdsProfile, name="pruned.study", val=pruned)
b3a84582	# Write the data cached in memory to the Profile GDS file
aa3f0a99	sync.gds(gdsfile=gdsProfile)
b3a84582	return(0L) }
2170193e	#' @title Add information related to low allelic fraction associated to #' the SNV dataset for a specific sample into a GDS file #' #' @description The function adds the information related to low allelic #' fraction #' associated to the SNV dataset for a specific sample into a #' GDS file, more specifically, in the "lap" node. The "lap" node must #' already be present in the GDS file. #'
aa3f0a99	#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}}
2170193e	#' (a GDS file), a GDS file. #' #' @param snpLap a \code{vector} of \code{numeric} value representing the #' low allelic fraction for each SNV present in the SNV dataset. The #' values should be between \code{0} and \code{0.50}. The #' length of the \code{vector} should correspond to the number of SNVs #' present in the "snp.id" entry of the GDS sample file. #' #' @return The integer \code{0L} when successful. #' #' @examples #'
1c304a2a	#' ## Required library #' library(gdsfmt) #'
1434071b	#' ## Create a temporary GDS file
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds")
2170193e	#'
5f024f9e	#' ## Create and open the GDS file #' gdsFile <- createfn.gds(filename=gdsFilePath)
2170193e	#'
5f024f9e	#' ## Create a "lap" node #' add.gdsn(node=gdsFile, name="lap", val=rep(10L, 12)) #' sync.gds(gdsFile)
2170193e	#'
5f024f9e	#' ## Vector of low allelic fraction #' lap <- c(0.1, 0.23, 0.34, 0.00, 0.12, 0.11, 0.33, 0.55)
2170193e	#'
5f024f9e	#' ## Add segments to the GDS file #' RAIDS:::addUpdateLap(gdsProfile=gdsFile, snpLap=lap)
2170193e	#'
5f024f9e	#' ## Read lap information from GDS file #' read.gdsn(index.gdsn(node=gdsFile, path="lap"))
2170193e	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=gdsFile)
851a4988	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
851a4988	#'
2170193e	#' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 #' @keywords internal
aa3f0a99	addUpdateLap <- function(gdsProfile, snpLap) {
2170193e
aa3f0a99	snpLap2 <- write.gdsn(index.gdsn(gdsProfile, "lap"), snpLap)
2170193e
aa3f0a99	sync.gds(gdsProfile)
2170193e	return(0L) }
040f95da	#' @title Extract the block identifiers for a list of SNVs #' #' @description The function uses the GDS Reference Annotation file to extract #' the unique block identifiers for a list of SNVs. The block type that is #' going to be used to extract the information has to be provided by the #' user. #' #' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), the opened Reference SNV Annotation GDS file. #' #' @param snpIndex a \code{vectcor} of \code{integer} representing the #' indexes of the SNVs of interest. #' #' @param blockTypeID a \code{character} string corresponding to the block #' type used to extract the block identifiers. The block type must be #' present in the GDS Reference Annotation file. #' #' @return a \code{vector} of \code{numeric} corresponding to the #' block identifiers for the SNVs of interest. #' #' @examples #' #' # Required library #' library(gdsfmt) #' #' ## Path to the demo 1KG Annotation GDS file located in this package #' dataDir <- system.file("extdata", package="RAIDS") #'
3751711a	#' path1KG <- file.path(dataDir, "tests") #' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
040f95da	#' #' gdsRefAnnotation <- openfn.gds(fileAnnotGDS) #' #' ## The indexes for the SNVs of interest #' snpIndex <- c(1,3,5,6,9) #' #' ## Extract the block identifiers for the SNVs represented by their indexes #' ## for the block created using the genes from Hsapiens Ensembl v86 #' RAIDS:::getBlockIDs(gdsRefAnnot=gdsRefAnnotation, snpIndex=snpIndex, #' blockTypeID="GeneS.Ensembl.Hsapiens.v86") #' #' closefn.gds(gdsRefAnnotation) #' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn read.gdsn #' @encoding UTF-8 #' @keywords internal getBlockIDs <- function(gdsRefAnnot, snpIndex, blockTypeID) { block.annot <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot")) pos <- which(block.annot$block.id == blockTypeID) if(length(pos) != 1) { stop("The following block type is not found in the ",
6fc47f63	"GDS Annotation file: \'", blockTypeID, "\'")
040f95da	} b <- read.gdsn(index.gdsn(gdsRefAnnot, "block"), start=c(1, pos),
6fc47f63	count = c(-1, 1))[snpIndex]
040f95da	return(b) }
4cf4bc51	#' @title Add information related to segments associated to the SNV #' dataset for a specific sample into a GDS file #' #' @description The function adds the information related to segments #' associated to the SNV dataset for a specific sample into a #' GDS file, more specifically, in the "segment" node. If the "segment" node #' already exists, the previous information is erased. #' #' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}} #' (a GDS file), a GDS Sample file. #' #' @param snpSeg a \code{vector} of \code{integer} representing the segment #' identifiers associated to each SNV selected for the specific sample. The #' length of the \code{vector} should correspond to the number of SNVs #' present in the "snp.id" entry of the GDS sample file. #' #' @return The integer \code{0L} when successful. #' #' @examples #' #' ## Required library #' library(gdsfmt) #' #' ## Temporary GDS file
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds")
4cf4bc51	#'
5f024f9e	#' ## Create and open the GDS file #' GDS_file_tmp <- createfn.gds(filename=gdsFilePath)
4cf4bc51	#'
5f024f9e	#' ## Vector of segment identifiers #' segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L)
4cf4bc51	#'
5f024f9e	#' ## Add segments to the GDS file #' RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments)
4cf4bc51	#'
5f024f9e	#' ## Read segments information from GDS file #' read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment"))
4cf4bc51	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=GDS_file_tmp)
4cf4bc51	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
4cf4bc51	#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn #' @encoding UTF-8 #' @keywords internal addUpdateSegment <- function(gdsProfile, snpSeg) { if("segment" %in% ls.gdsn(gdsProfile)) { snpLap <- write.gdsn(index.gdsn(gdsProfile, "segment"), snpSeg) } else{ snpLap <- add.gdsn(gdsProfile, "segment", snpSeg, storage="uint32") } sync.gds(gdsProfile) ## Successful return(0L) } #' @title Append sample names into a GDS file #' #' @description This function append the sample identifiers into the #' "samples.id" node of a GDS file. #' #' @param gds an object of class #' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file. #'
38d46f1e	#' @param listSamples a \code{vector} of \code{character} string representing
4cf4bc51	#' the sample identifiers to be added to GDS file. #' #' #' @return The integer \code{0L} when successful. #' #' @examples #' #' ## Required library #' library(gdsfmt) #' #' ## Temporary GDS file in current directory
5f024f9e	#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_04.gds")
4cf4bc51	#'
5f024f9e	#' ## Create and open the GDS file #' GDS_file_tmp <- createfn.gds(filename=gdsFilePath)
4cf4bc51	#'
5f024f9e	#' ## Create "sample.id" node (the node must be present) #' add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01",
4cf4bc51	#' "sample_02")) #'
5f024f9e	#' sync.gds(gdsfile=GDS_file_tmp)
4cf4bc51	#'
5f024f9e	#' ## Add information about 2 samples to the GDS file #' RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp,
4cf4bc51	#' listSamples=c("sample_03", "sample_04")) #'
5f024f9e	#' ## Read sample identifier list #' ## Only "sample_03" and "sample_04" should have been added #' read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id"))
4cf4bc51	#'
5f024f9e	#' ## Close GDS file #' closefn.gds(gdsfile=GDS_file_tmp)
4cf4bc51	#'
5f024f9e	#' ## Delete the temporary GDS file #' unlink(x=gdsFilePath, force=TRUE)
4cf4bc51	#' #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt index.gdsn append.gdsn #' @encoding UTF-8 #' @keywords internal appendGDSSampleOnly <- function(gds, listSamples) { sampleGDS <- index.gdsn(gds, "sample.id") append.gdsn(sampleGDS, val=listSamples, check=TRUE) return(0L) }
18056f22	#' @title Append information associated to ld blocks, as indexes, into the #' Population Reference SNV Annotation GDS file #' #' @description The function appends the information about the ld blocks into #' the Population Reference SNV Annotation GDS file. The information is #' extracted from the parameter listBlock. #' #' @param gds an object of class \link[gdsfmt]{gds.class} #' (GDS file), an opened Reference Annotation GDS file. #' #' @param listBlock a \code{array} of integer #' representing the linkage disequilibrium block for #' each SNV in the in the same order than the variant #' in Population reference dataset. #' #' @param blockName a \code{character} string representing the id of the block. #' The blockName should not exist in \'gdsRefAnnotFile\'. #' #' @param blockDesc a \code{character} string representing the description of #' the block. #' #' @return The integer \code{0L} when successful. #' #' @examples #' #' ## Required library for GDS #' library(gdsfmt) #' ## Path to the demo pedigree file is located in this package #' dataDir <- system.file("extdata", package="RAIDS") #' # ## Temporary file #' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds") #' #' #' file.copy(file.path(dataDir, "tests", #' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS) #' #' #' fileReferenceGDS <- file.path(dataDir, "tests", #' "ex1_good_small_1KG.gds") #' \donttest{ #' gdsRef <- openfn.gds(fileReferenceGDS) #' listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position")) #' listBlock <- rep(-1, length(listBlock)) #' closefn.gds(gdsRef) #' gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE) #' ## Append information associated to blocks #' RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG, #' listBlock=listBlock, #' blockName="blockEmpty", #' blockDesc="Example")
bdbca27f	#' #' closefn.gds(gdsAnnot1KG)
18056f22	#' #' gdsAnnot1KG <- openfn.gds(fileAnnotGDS) #' print(gdsAnnot1KG) #' #' closefn.gds(gdsAnnot1KG) #' } #' #' ## Remove temporary file #' unlink(fileAnnotGDS, force=TRUE) #' #' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz #' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn #' @importFrom gdsfmt append.gdsn sync.gds #' @encoding UTF-8 #' @keywords internal addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) { blockAnnot <- data.frame(block.id=blockName, block.desc=blockDesc, stringsAsFactors=FALSE) if(! ("block.annot" %in% ls.gdsn(gds))) { varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot) }else { curAnnot <- index.gdsn(gds, "block.annot/block.id") append.gdsn(curAnnot,blockAnnot$block.id) curAnnot <- index.gdsn(gds, "block.annot/block.desc") append.gdsn(curAnnot, blockAnnot$block.desc) } varBlock <- NULL if(! ("block" %in% ls.gdsn(gds))){ varBlock <- add.gdsn(gds, "block", valdim=c(length(listBlock), 1), listBlock, storage="int32", compress = "LZ4_RA") readmode.gdsn(varBlock) }else { if(is.null(varBlock)) { varBlock <- index.gdsn(gds, "block") varBlock <- compression.gdsn(varBlock, "") } append.gdsn(varBlock, listBlock) varBlock <- compression.gdsn(varBlock, "LZ4_RA") } sync.gds(gds) return(0L) }