dd76ad8e |
#' @title Initialization of the section related to the profile
|
be6ebeca |
#' information in the GDS file
#'
|
dd76ad8e |
#' @description This function initializes the section related to the profile
|
be6ebeca |
#' information in the GDS file. The information is extracted from
#' the \code{data.frame} passed to the function. The nodes "sample.id" and
#' "sample.annot" are created in the GDS file.
#'
|
48837112 |
#' @param gdsReference an object of class
|
be6ebeca |
#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.
#'
|
32e75790 |
#' @param dfPedReference a \code{data.frame} containing the information
#' related to the
|
be6ebeca |
#' samples. It must have those columns: "sample.id", "Name.ID", "sex",
#' "pop.group", "superPop" and "batch". All columns, except "sex" and batch",
#' are \code{character} strings. The "batch" and "sex" columns are
#' \code{integer}. The unique identifier
#' of this \code{data.frame} is the "Name.ID" column. The row names of the
#' \code{data.frame} must correspond to the identifiers present in the
#' "Name.ID" column.
#'
#' @param listSamples a \code{vector} of \code{character} string representing
|
dd76ad8e |
#' the identifiers of the selected profiles. If \code{NULL}, all profiles are
|
be6ebeca |
#' selected. Default: \code{NULL}.
#'
#' @return a \code{vector} of \code{character} string with the identifiers of
|
dd76ad8e |
#' the profiles saved in the GDS file.
|
be6ebeca |
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
4cf4bc51 |
#' ## Temporary GDS file in current directory
|
045e8817 |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_10.gds")
|
be6ebeca |
#'
|
045e8817 |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
be6ebeca |
#'
|
045e8817 |
#' ## Create "sample.annot" node (the node must be present)
#' pedInformation <- data.frame(sample.id=c("sample_01", "sample_02"),
|
0fdab8b8 |
#' Name.ID=c("sample_01", "sample_02"),
#' sex=c(1,1), # 1:Male 2: Female
#' pop.group=c("ACB", "ACB"),
#' superPop=c("AFR", "AFR"),
#' batch=c(1, 1),
#' stringsAsFactors=FALSE)
|
be6ebeca |
#'
|
045e8817 |
#' ## The row names must be the sample identifiers
#' rownames(pedInformation) <- pedInformation$Name.ID
|
be6ebeca |
#'
|
045e8817 |
#' ## Add information about 2 samples to the GDS file
#' RAIDS:::generateGDSRefSample(gdsReference=tmpGDS,
|
0fdab8b8 |
#' dfPedReference=pedInformation, listSamples=NULL)
|
be6ebeca |
#'
|
045e8817 |
#' ## Read sample identifier list
#' read.gdsn(index.gdsn(node=tmpGDS, path="sample.id"))
|
be6ebeca |
#'
|
045e8817 |
#' ## Read sample information from GDS file
#' read.gdsn(index.gdsn(node=tmpGDS, path="sample.annot"))
|
be6ebeca |
#'
|
045e8817 |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
0fdab8b8 |
#'
|
045e8817 |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
0fdab8b8 |
#'
|
be6ebeca |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn
#' @encoding UTF-8
#' @keywords internal
|
32e75790 |
generateGDSRefSample <- function(gdsReference, dfPedReference,
listSamples=NULL) {
|
be6ebeca |
if(!(is.null(listSamples))){
|
b6cc5ca4 |
dfPedReference <- dfPedReference[listSamples,]
|
be6ebeca |
}
|
32e75790 |
add.gdsn(node=gdsReference, name="sample.id",
val=dfPedReference[, "Name.ID"])
|
be6ebeca |
## Create a data.frame containing the information form the samples
|
b6cc5ca4 |
samp.annot <- data.frame(sex=dfPedReference[, "sex"],
|
32e75790 |
pop.group=dfPedReference[, "pop.group"],
superPop=dfPedReference[, "superPop"],
|
b6cc5ca4 |
batch=dfPedReference[, "batch"], stringsAsFactors=FALSE)
|
be6ebeca |
## Add the data.frame to the GDS object
|
48837112 |
add.gdsn(node=gdsReference, name="sample.annot", val=samp.annot)
|
be6ebeca |
## Return the vector of saved samples
|
b6cc5ca4 |
return(dfPedReference[, "Name.ID"])
|
be6ebeca |
}
#' @title Create a "sample.ref" node i a GDS file with the information about
#' the related/unrelated state of the reference samples
#'
#' @description This function creates a "sample.ref" node in the GDS file.
#' The node contains a vector of integers with value of 1 when
#' the samples are used as references and 0 otherwise.
#' The information used to fill the "sample.ref" node comes from the RDS file
#' that contains the information about the unrelated reference samples.
#'
|
aa3f0a99 |
#' @param gdsReference an object of class
|
be6ebeca |
#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.
#'
#' @param filePart a \code{character} string representing the path and file
#' name of a RDS file containing the information about the related and
#' unrelated samples in the reference dataset. The RDS file must exist. The
#' RDS file must contains a \code{vector} of \code{character} strings called
#' "unrels" with the name of the unrelated samples.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
3e69f2d1 |
#' ## Locate RDS with unrelated/related status for Reference samples
|
bdf3a5d8 |
#' dataDir <- system.file("extdata", package="RAIDS")
#' rdsFilePath <- file.path(dataDir, "unrelatedPatientsInfo_Demo.rds")
|
be6ebeca |
#'
|
1434071b |
#' ## Temporary GDS file
|
045e8817 |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds")
|
0fdab8b8 |
#'
|
045e8817 |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
be6ebeca |
#
|
045e8817 |
#' ## Create "sample.id" node (the node must be present)
#' sampleIDs <- c("HG00104", "HG00109", "HG00110")
#' add.gdsn(node=tmpGDS, name="sample.id", val=sampleIDs)
|
0fdab8b8 |
#'
|
045e8817 |
#' ## Create "sample.ref" node in GDS file using RDS information
#' RAIDS:::addGDSRef(gdsReference=tmpGDS, filePart=rdsFilePath)
|
be6ebeca |
#'
|
045e8817 |
#' ## Read sample reference data.frame
#' read.gdsn(index.gdsn(node=tmpGDS, path="sample.ref"))
|
be6ebeca |
#'
|
045e8817 |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
be6ebeca |
#'
|
045e8817 |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
be6ebeca |
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn
#' @encoding UTF-8
#' @keywords internal
|
aa3f0a99 |
addGDSRef <- function(gdsReference, filePart) {
|
be6ebeca |
part <- readRDS(filePart)
|
aa3f0a99 |
sampleGDS <- index.gdsn(gdsReference, "sample.id")
|
be6ebeca |
df <- data.frame(sample.id=read.gdsn(sampleGDS),
sample.ref=0, stringsAsFactors=FALSE)
# The order of part$unrels is not the same than df$sample.id
df[df$sample.id %in% part$unrels, "sample.ref"] <- 1
|
aa3f0a99 |
add.gdsn(gdsReference, "sample.ref", df$sample.ref, storage="bit1")
|
be6ebeca |
## Success
return(0L)
}
|
dd76ad8e |
#' @title Appends the genotype information for specific samples
#' (1 column == 1 profile) into a GDS file
|
be6ebeca |
#'
#' @description This function appends the genotype information into a
#' GDS file. More specifically, the genotype information is added to the
#' "genotype" node. The "genotype" node must already be present in the
#' GDS file. The genotype information is a matrix with the rows corresponding
#' to SNVs and columns corresponding to samples.
#' The number of rows of the new genotype information must
#' correspond to the number of rows of the matrix present in the
#' "genotype" node.
#'
#' @param gds an object of class
|
dd76ad8e |
#' \link[gdsfmt]{gds.class} (a GDS file), the opened Profile GDS file.
|
be6ebeca |
#'
#' @param matG a \code{matrix} of \code{integer} representing the genotypes
#' of the SNVs for one or multiple samples. The rows correspond to SNVs and
#' the columns correspond to samples. The number of rows must
#' correspond to the number of rows of the matrix present in the
#' "genotype" node.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
1434071b |
#' ## Create a temporary GDS file
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_06.gds")
|
be6ebeca |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
be6ebeca |
#'
|
5f024f9e |
#' ## Create a "genotype" node with initial matrix
#' genoInitial <- matrix(rep(0L, 10), nrow=2)
|
be6ebeca |
#'
|
5f024f9e |
#' add.gdsn(node=tmpGDS, name="genotype", val=genoInitial)
#' sync.gds(tmpGDS)
|
be6ebeca |
#'
|
5f024f9e |
#' ## New genotype information to be added
#' newGenotype <- matrix(rep(1L, 6), nrow=2)
|
be6ebeca |
#'
|
5f024f9e |
#' ## Add segments to the GDS file
#' RAIDS:::appendGDSgenotypeMat(gds=tmpGDS, matG=newGenotype)
|
be6ebeca |
#'
|
5f024f9e |
#' ## Read genotype information from GDS file
#' ## The return matrix should be a combination of both initial matrix
#' ## and new matrix (column binded)
#' read.gdsn(index.gdsn(node=tmpGDS, path="genotype"))
|
be6ebeca |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
1434071b |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
1434071b |
#'
|
be6ebeca |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils read.csv2
#' @encoding UTF-8
#' @keywords internal
appendGDSgenotypeMat <- function(gds, matG) {
geno.var <- index.gdsn(gds, "genotype")
append.gdsn(node=geno.var, val=matG, check=TRUE)
return(0L)
}
|
e8cb5540 |
#' @title Append the genotype information from a profile into the associated
|
9609f383 |
#' Profile GDS File
|
be6ebeca |
#'
|
e8cb5540 |
#' @description This function append the genotype information from a specific
#' profile into the Profile GDS file. The genotype information is extracted
#' from a SNV file as generated by SNP-pileup or other tools.
|
be6ebeca |
#'
|
87c14221 |
#' @param pathGeno a \code{character} string representing the path to the
#' directory containing the VCF output of SNP-pileup for each sample. The
#' SNP-pileup files must be compressed (gz files) and have the name identifiers
#' of the samples. A sample with "Name.ID" identifier would have an
#' associated file called
#' if genoSource is "VCF", then "Name.ID.vcf.gz",
#' if genoSource is "generic", then "Name.ID.generic.txt.gz"
#' if genoSource is "snp-pileup", then "Name.ID.txt.gz".
|
be6ebeca |
#'
#' @param listSamples a \code{vector} of \code{character} string corresponding
|
dd76ad8e |
#' to the sample identifiers that will have a Profile GDS file created. The
|
be6ebeca |
#' sample identifiers must be present in the "Name.ID" column of the
|
b6cc5ca4 |
#' \code{data.frame} passed to the \code{dfPedProfile} parameter.
|
be6ebeca |
#'
|
e8cb5540 |
#' @param listPos a \code{data.frame} containing 2 columns. The first column,
#' called "snp.chromosome" contains the name of the chromosome where the
#' SNV is located. The second column, called "snp.position" contains the
#' position of the SNV on the chromosome.
|
be6ebeca |
#'
|
e8cb5540 |
#' @param offset a \code{integer} to adjust if the genome start at 0 or 1.
|
be6ebeca |
#'
|
e8cb5540 |
#' @param minCov a single positive \code{integer} representing the minimum
#' coverage needed to keep the SNVs in the analysis. Default: \code{10}.
|
be6ebeca |
#'
|
e8cb5540 |
#' @param minProb a single positive \code{numeric} between 0 and 1
#' representing the probability that the base change at the SNV position
#' is not an error.
#' Default: \code{0.999}.
|
be6ebeca |
#'
#' @param seqError a single positive \code{numeric} between 0 and 1
#' representing the sequencing error rate. Default: \code{0.001}.
#'
|
32e75790 |
#' @param dfPedProfile a \code{data.frame} with the information about
#' the sample(s).
|
be6ebeca |
#' Those are mandatory columns: "Name.ID",
|
040f95da |
#' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in
#' \code{character} strings format. The \code{data.frame}
|
be6ebeca |
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter.
#'
#' @param batch a single positive \code{integer} representing the current
#' identifier for the batch. Beware, this field is not stored anymore.
#'
#' @param studyDF a \code{data.frame} containing the information about the
#' study associated to the analysed sample(s). The \code{data.frame} must have
#' those 3 columns: "study.id", "study.desc", "study.platform". All columns
#' must be in \code{character} strings.
#'
|
e8cb5540 |
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Sample files will be created.
|
ebecc683 |
#'
|
e8cb5540 |
#' @param genoSource a \code{character} string with two possible values:
|
9ed5a917 |
#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
|
e8cb5540 |
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
|
9ed5a917 |
#' Finally the file can be a VCF file with at least those genotype
|
279ad515 |
#' fields: GT, AD, DP.
|
ebecc683 |
#'
|
be6ebeca |
#' @param verbose a \code{logical} indicating if the function must print
#' messages when running.
#'
#' @return The function returns \code{0L} when successful.
#'
#' @examples
#'
|
7b9d4130 |
#' ## Current directory
|
5f024f9e |
#' dataDir <- file.path(tempdir())
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Copy required file into current directory
#' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"),
|
7b9d4130 |
#' "ex1.txt.gz"), to=dataDir)
#'
|
5f024f9e |
#' ## The data.frame containing the information about the study
#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform"
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' studyDF <- data.frame(study.id = "MYDATA",
|
e8cb5540 |
#' study.desc = "Description",
#' study.platform = "PLATFORM",
#' stringsAsFactors = FALSE)
#'
|
5f024f9e |
#' ## The data.frame containing the information about the samples
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' samplePED <- data.frame(Name.ID=c("ex1", "ex2"),
|
e8cb5540 |
#' Case.ID=c("Patient_h11", "Patient_h12"),
#' Diagnosis=rep("Cancer", 2),
#' Sample.Type=rep("Primary Tumor", 2),
#' Source=rep("Databank B", 2), stringsAsFactors=FALSE)
|
5f024f9e |
#' rownames(samplePED) <- samplePED$Name.ID
|
e8cb5540 |
#'
|
5f024f9e |
#' ## List of SNV positions
#' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)),
|
7b9d4130 |
#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527,
#' 3469737, 3471497, 3471565, 3471618))
|
e8cb5540 |
#'
|
5f024f9e |
#' ## Append genotype information to the Profile GDS file
#' result <- RAIDS:::generateGDS1KGgenotypeFromSNPPileup(pathGeno=dataDir,
|
e8cb5540 |
#' listSamples=c("ex1"), listPos=listPositions,
#' offset=-1, minCov=10, minProb=0.999, seqError=0.001,
|
b6cc5ca4 |
#' dfPedProfile=samplePED, batch=1, studyDF=studyDF,
|
e8cb5540 |
#' pathProfileGDS=dataDir, genoSource="snp-pileup",
#' verbose=FALSE)
#'
|
5f024f9e |
#' ## The function returns OL when successful
#' result
|
7b9d4130 |
#'
|
5f024f9e |
#' ## The Profile GDS file 'ex1.gds' has been created in the
#' ## specified directory
#' list.files(dataDir)
|
e8cb5540 |
#'
|
5f024f9e |
#' ## Unlink Profile GDS file (created for demo purpose)
#' unlink(file.path(dataDir, "ex1.gds"))
#' unlink(file.path(dataDir, "ex1.txt.gz"))
|
e8cb5540 |
#'
|
be6ebeca |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds
#' @importFrom stats qbinom
#' @importFrom utils read.csv
#' @encoding UTF-8
#' @keywords internal
generateGDS1KGgenotypeFromSNPPileup <- function(pathGeno,
listSamples, listPos, offset, minCov=10, minProb=0.999,
|
b6cc5ca4 |
seqError=0.001, dfPedProfile, batch, studyDF, pathProfileGDS,
|
e8cb5540 |
genoSource, verbose) {
|
be6ebeca |
# File with the description of the SNP keep
|
279ad515 |
if(genoSource == "VCF"){
listMat <- dir(pathGeno, pattern = ".+.vcf.gz")
listSampleFile <- gsub(".vcf.gz", "", listMat)
}else{
listMat <- dir(pathGeno, pattern = ".+.txt.gz")
listSampleFile <- gsub(".txt.gz", "", listMat)
}
|
ebecc683 |
|
be6ebeca |
g <- as.matrix(rep(-1, nrow(listPos)))
for(i in seq_len(length(listSamples))) {
pos <- which(listSampleFile == listSamples[i])
if(verbose) { message(listSamples[i], " ", Sys.time(), " ", i) }
if(length(pos) == 1) {
|
a597d143 |
#
|
ebecc683 |
if(genoSource == "snp-pileup") {
matSample <- readSNVPileupFile(file.path(pathGeno,
listMat[pos]), offset)
|
10c6487c |
} else if(genoSource == "generic") {
|
ebecc683 |
matSample <- readSNVFileGeneric(file.path(pathGeno,
listMat[pos]), offset)
|
279ad515 |
} else if(genoSource == "VCF") {
tmpProfile <- gsub(".vcf.gz", "",listMat[pos])
matSample <- readSNVVCF(file.path(pathGeno,
listMat[pos]),
profileName=tmpProfile, offset)
|
a597d143 |
}
|
be6ebeca |
# matAll <- merge(matSample[,c( "Chromosome", "Position",
# "File1R", "File1A",
# "count" )],
# listPos,
# by.x = c("Chromosome", "Position"),
# by.y = c("snp.chromosome", "snp.position"),
# all.y = TRUE,
# all.x = FALSE)
#
# below same as the merge above but faster
z <- cbind(c(listPos$snp.chromosome, matSample$Chromosome,
matSample$Chromosome),
c(listPos$snp.position, matSample$Position,
matSample$Position),
c(rep(1,nrow(listPos)), rep(0,nrow(matSample)),
rep(2,nrow(matSample))),
c(rep(0,nrow(listPos)), matSample[, "File1R"],
-1 * matSample[, "File1R"]),
c(rep(0,nrow(listPos)), matSample[, "File1A"],
-1 * matSample[, "File1A"]),
c(rep(0,nrow(listPos)), matSample[, "count"],
-1 * matSample[, "count"]))
rm(matSample)
z <- z[order(z[,1], z[,2], z[,3]),]
matAll <- data.frame(Chromosome=z[z[, 3] == 1, 1],
Position=z[z[, 3] == 1, 2], File1R=cumsum(z[, 4])[z[, 3] == 1],
File1A=cumsum(z[,5])[z[, 3] == 1],
count=cumsum(z[, 6])[z[, 3] == 1])
rm(z)
|
e8cb5540 |
if(is.null(pathProfileGDS)){
stop("pathProfileGDS is NULL in ",
|
be6ebeca |
"generateGDS1KGgenotypeFromSNPPileup\n")
} else{
|
e8cb5540 |
if(! dir.exists(pathProfileGDS)) {
dir.create(pathProfileGDS)
|
be6ebeca |
}
}
|
e8cb5540 |
fileGDSSample <- file.path(pathProfileGDS,
|
be6ebeca |
paste0(listSamples[i], ".gds"))
if(file.exists(fileGDSSample)) {
gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE)
} else{
gdsSample <- createfn.gds(fileGDSSample)
}
if (! "Ref.count" %in% ls.gdsn(gdsSample)) {
var.Ref <- add.gdsn(gdsSample, "Ref.count", matAll$File1R,
valdim=c( nrow(listPos), 1),
storage="sp.int16")
var.Alt <- add.gdsn(gdsSample, "Alt.count", matAll$File1A,
valdim=c( nrow(listPos), 1),
storage="sp.int16")
var.Count <- add.gdsn(gdsSample, "Total.count",
matAll$count, valdim=c( nrow(listPos), 1),
storage="sp.int16")
} else {
# you must append
var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"),
matAll$File1R)
var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"),
matAll$File1A)
var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"),
matAll$count)
}
|
3104ba2c |
listSampleGDS <- addStudyGDSSample(gdsSample,
pedProfile=dfPedProfile, batch=batch,
listSamples=c(listSamples[i]), studyDF=studyDF, verbose=verbose)
|
be6ebeca |
listCount <- table(matAll$count[matAll$count >= minCov])
cutOffA <-
data.frame(count=unlist(vapply(as.integer(names(listCount)),
FUN=function(x, minProb, eProb){
|
e8cb5540 |
return(max(2,qbinom(minProb, x, eProb))) },
|
be6ebeca |
FUN.VALUE=numeric(1), minProb=minProb, eProb=2 * seqError)),
allele=unlist(vapply(as.integer(names(listCount)),
FUN=function(x, minProb, eProb){
|
e8cb5540 |
return(max(2,qbinom(minProb, x, eProb))) },
|
be6ebeca |
FUN.VALUE=numeric(1), minProb=minProb, eProb=seqError)))
row.names(cutOffA) <- names(listCount)
# Initialize the genotype array at -1
# Select the position where the coverage of the 2 alleles is enough
listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov)
matAllC <- matAll[listCov,]
# The difference depth - (nb Ref + nb Alt) can be realistically
# explain by sequencing error
listCov <- listCov[(matAllC$count -
(matAllC$File1R + matAllC$File1A)) <
cutOffA[as.character(matAllC$count), "count"]]
matAllC <- matAll[listCov,]
rm(matAll)
g <- as.matrix(rep(-1, nrow(listPos)))
# The sample is homozygote if the other known allele have a
# coverage of 0
g[listCov][which(matAllC$File1A == 0)] <- 0
g[listCov][which(matAllC$File1R == 0)] <- 2
# The sample is heterozygote if explain the coverage of
# the minor allele by sequencing error is not realistic.
g[listCov][which(matAllC$File1A >=
cutOffA[as.character(matAllC$count), "allele"] &
matAllC$File1R >= cutOffA[as.character(matAllC$count),
"allele"])] <- 1
#g <- as.matrix(g)
if("geno.ref" %in% ls.gdsn(gdsSample)){
var.geno <- index.gdsn(gdsSample, "geno.ref")
compression.gdsn(var.geno, compress="")
|
e8cb5540 |
append.gdsn(var.geno, g, check=TRUE)
|
be6ebeca |
compression.gdsn(var.geno, compress="LZMA_RA.fast")
readmode.gdsn(var.geno)
}else{
var.geno <- add.gdsn(gdsSample, "geno.ref",
valdim=c(length(g), 1),
g, storage="bit2", compress = "LZMA_RA.fast")
readmode.gdsn(var.geno)
}
rm(g)
closefn.gds(gdsfile=gdsSample)
if (verbose) {
message(listMat[pos], " ", i, " ", Sys.time())
}
} else {
stop("Missing samples ", listSamples[i])
}
}
## Success
return(0L)
}
|
dd2ee042 |
#' @title Append the genotype information from a profile into the associated
#' Profile GDS File
#'
#' @description This function append the genotype information from a specific
#' profile into the Profile GDS file. The genotype information is extracted
#' from a SNV file as generated by SNP-pileup or other tools.
#'
#' @param profileFile a \code{character} string representing the path and the
#' file name of the genotype file or the bam if genoSource is snp-pileup the
#' fine extension must be .txt.gz, if VCF the extension must be .vcf.gz
#'
#' @param profileName a \code{character} string representing the profileName
#'
#' @param listPos a \code{data.frame} containing 2 columns. The first column,
#' called "snp.chromosome" contains the name of the chromosome where the
#' SNV is located. The second column, called "snp.position" contains the
#' position of the SNV on the chromosome.
#'
#' @param offset a \code{integer} to adjust if the genome start at 0 or 1.
#'
#' @param minCov a single positive \code{integer} representing the minimum
#' coverage needed to keep the SNVs in the analysis. Default: \code{10}.
#'
#' @param minProb a single positive \code{numeric} between 0 and 1
#' representing the probability that the base change at the SNV position
#' is not an error.
#' Default: \code{0.999}.
#'
#' @param seqError a single positive \code{numeric} between 0 and 1
#' representing the sequencing error rate. Default: \code{0.001}.
#'
#' @param dfPedProfile a \code{data.frame} with the information about
#' the sample(s).
#' Those are mandatory columns: "Name.ID",
#' "Case.ID", "Sample.Type", "Diagnosis" and "Source". All columns must be in
#' \code{character} strings format. The \code{data.frame}
#' must contain the information for all the samples passed in the
#' \code{listSamples} parameter.
#'
#' @param batch a single positive \code{integer} representing the current
#' identifier for the batch. Beware, this field is not stored anymore.
#'
#' @param studyDF a \code{data.frame} containing the information about the
#' study associated to the analysed sample(s). The \code{data.frame} must have
#' those 3 columns: "study.id", "study.desc", "study.platform". All columns
#' must be in \code{character} strings.
#'
#' @param pathProfileGDS a \code{character} string representing the path to
#' the directory where the GDS Sample files will be created.
#'
#' @param genoSource a \code{character} string with two possible values:
#' 'snp-pileup', 'generic' or 'VCF'. It specifies if the genotype files
#' are generated by snp-pileup (Facets) or are a generic format CSV file
#' with at least those columns:
#' 'Chromosome', 'Position', 'Ref', 'Alt', 'Count', 'File1R' and 'File1A'.
#' The 'Count' is the depth at the specified position;
#' 'FileR' is the depth of the reference allele and
#' 'File1A' is the depth of the specific alternative allele.
#' Finally the file can be a VCF file with at least those genotype
#' fields: GT, AD, DP.
#'
|
9969cfc7 |
#' @param paramProfileGDS a \code{list} parameters ...
#'
|
dd2ee042 |
#' @param verbose a \code{logical} indicating if the function must print
#' messages when running.
#'
#' @return The function returns \code{0L} when successful.
#'
#' @examples
#'
#' ## Current directory
#' dataDir <- file.path(tempdir())
#'
#' ## Copy required file into current directory
#' file.copy(from=file.path(system.file("extdata/tests", package="RAIDS"),
#' "ex1.txt.gz"), to=dataDir)
#'
#' ## The data.frame containing the information about the study
#' ## The 3 mandatory columns: "study.id", "study.desc", "study.platform"
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' studyDF <- data.frame(study.id = "MYDATA",
#' study.desc = "Description",
#' study.platform = "PLATFORM",
#' stringsAsFactors = FALSE)
#'
#' ## The data.frame containing the information about the samples
#' ## The entries should be strings, not factors (stringsAsFactors=FALSE)
#' samplePED <- data.frame(Name.ID=c("ex1", "ex2"),
#' Case.ID=c("Patient_h11", "Patient_h12"),
#' Diagnosis=rep("Cancer", 2),
#' Sample.Type=rep("Primary Tumor", 2),
#' Source=rep("Databank B", 2), stringsAsFactors=FALSE)
#' rownames(samplePED) <- samplePED$Name.ID
#'
#' ## List of SNV positions
#' listPositions <- data.frame(snp.chromosome=c(rep(1, 10)),
#' snp.position=c(3467333, 3467428, 3469375, 3469387, 3469502, 3469527,
#' 3469737, 3471497, 3471565, 3471618))
#'
#' ## Append genotype information to the Profile GDS file
#' result <- RAIDS:::generateProfileGDS(profileFile=file.path(dataDir, "ex1.txt.gz"),
#' profileName="ex1", listPos=listPositions,
#' offset=-1, minCov=10, minProb=0.999, seqError=0.001,
#' dfPedProfile=samplePED, batch=1, studyDF=studyDF,
#' pathProfileGDS=dataDir, genoSource="snp-pileup",
#' verbose=FALSE)
#'
#' ## The function returns OL when successful
#' result
#'
#' ## The Profile GDS file 'ex1.gds' has been created in the
#' ## specified directory
#' list.files(dataDir)
#'
#' ## Unlink Profile GDS file (created for demo purpose)
#' unlink(file.path(dataDir, "ex1.gds"))
#' unlink(file.path(dataDir, "ex1.txt.gz"))
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds
#' @importFrom stats qbinom
#' @importFrom utils read.csv
#' @encoding UTF-8
#' @keywords internal
generateProfileGDS <- function(profileFile, profileName,
listPos, offset, minCov=10, minProb=0.999,
seqError=0.001, dfPedProfile, batch, studyDF, pathProfileGDS,
|
9969cfc7 |
genoSource, paramProfileGDS, verbose) {
|
dd2ee042 |
# File with the description of the SNP keep
# if(genoSource == "VCF"){
# listMat <- dir(pathGeno, pattern = ".+.vcf.gz")
# listSampleFile <- gsub(".vcf.gz", "", listMat)
# }else{
# listMat <- dir(pathGeno, pattern = ".+.txt.gz")
# listSampleFile <- gsub(".txt.gz", "", listMat)
# }
|
2eb8e602 |
|
dd2ee042 |
# for(i in seq_len(length(listSamples))) {
#pos <- which(listSampleFile == listSamples[i])
if(verbose) { message("generateProfileGDS start ", " ", Sys.time()) }
# if(length(pos) == 1) {
#
if(genoSource == "snp-pileup") {
matSample <- readSNVPileupFile(profileFile, offset)
} else if(genoSource == "generic") {
matSample <- readSNVFileGeneric(profileFile, offset)
} else if(genoSource == "VCF") {
# tmpProfile <- gsub(".vcf.gz", "",listMat[pos])
matSample <- readSNVVCF(profileFile,
profileName=profileName, offset)
} else if(genoSource == "bam"){
|
2eb8e602 |
|
9969cfc7 |
matSample <- readSNVBAM(fileName=profileFile,
varSelected=listPos,
paramSNVBAM=paramProfileGDS,
|
2eb8e602 |
offset,
|
9969cfc7 |
verbose=verbose)
|
3ab308bd |
# listPos <- do.call(rbind, listPos)
|
9969cfc7 |
colnames(listPos)[1:2] <- c("snp.chromosome", "snp.position")
|
2eb8e602 |
|
dd2ee042 |
}
# matAll <- merge(matSample[,c( "Chromosome", "Position",
# "File1R", "File1A",
# "count" )],
# listPos,
# by.x = c("Chromosome", "Position"),
# by.y = c("snp.chromosome", "snp.position"),
# all.y = TRUE,
# all.x = FALSE)
#
# below same as the merge above but faster
|
2eb8e602 |
if(verbose) {message("End read ", Sys.time())}
g <- as.matrix(rep(-1, nrow(listPos)))
|
dd2ee042 |
z <- cbind(c(listPos$snp.chromosome, matSample$Chromosome,
matSample$Chromosome),
c(listPos$snp.position, matSample$Position,
matSample$Position),
c(rep(1,nrow(listPos)), rep(0,nrow(matSample)),
rep(2,nrow(matSample))),
c(rep(0,nrow(listPos)), matSample[, "File1R"],
-1 * matSample[, "File1R"]),
c(rep(0,nrow(listPos)), matSample[, "File1A"],
-1 * matSample[, "File1A"]),
c(rep(0,nrow(listPos)), matSample[, "count"],
-1 * matSample[, "count"]))
rm(matSample)
z <- z[order(z[,1], z[,2], z[,3]),]
matAll <- data.frame(Chromosome=z[z[, 3] == 1, 1],
Position=z[z[, 3] == 1, 2], File1R=cumsum(z[, 4])[z[, 3] == 1],
File1A=cumsum(z[,5])[z[, 3] == 1],
count=cumsum(z[, 6])[z[, 3] == 1])
rm(z)
if(is.null(pathProfileGDS)){
stop("pathProfileGDS is NULL in ",
"generateGDS1KGgenotypeFromSNPPileup\n")
} else{
if(! dir.exists(pathProfileGDS)) {
dir.create(pathProfileGDS)
}
}
fileGDSSample <- file.path(pathProfileGDS,
paste0(profileName, ".gds"))
if(file.exists(fileGDSSample)) {
gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE)
} else{
gdsSample <- createfn.gds(fileGDSSample)
}
if (! "Ref.count" %in% ls.gdsn(gdsSample)) {
var.Ref <- add.gdsn(gdsSample, "Ref.count", matAll$File1R,
valdim=c( nrow(listPos), 1),
storage="sp.int16")
var.Alt <- add.gdsn(gdsSample, "Alt.count", matAll$File1A,
valdim=c( nrow(listPos), 1),
storage="sp.int16")
var.Count <- add.gdsn(gdsSample, "Total.count",
matAll$count, valdim=c( nrow(listPos), 1),
storage="sp.int16")
} else {
# you must append
var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"),
matAll$File1R)
var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"),
matAll$File1A)
var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"),
matAll$count)
}
listSampleGDS <- addStudyGDSSample(gdsSample,
pedProfile=dfPedProfile, batch=batch,
listSamples=c(profileName), studyDF=studyDF, verbose=verbose)
listCount <- table(matAll$count[matAll$count >= minCov])
cutOffA <-
data.frame(count=unlist(vapply(as.integer(names(listCount)),
FUN=function(x, minProb, eProb){
return(max(2,qbinom(minProb, x, eProb))) },
FUN.VALUE=numeric(1), minProb=minProb, eProb=2 * seqError)),
allele=unlist(vapply(as.integer(names(listCount)),
FUN=function(x, minProb, eProb){
return(max(2,qbinom(minProb, x, eProb))) },
FUN.VALUE=numeric(1), minProb=minProb, eProb=seqError)))
row.names(cutOffA) <- names(listCount)
# Initialize the genotype array at -1
# Select the position where the coverage of the 2 alleles is enough
listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov)
matAllC <- matAll[listCov,]
# The difference depth - (nb Ref + nb Alt) can be realistically
# explain by sequencing error
listCov <- listCov[(matAllC$count -
(matAllC$File1R + matAllC$File1A)) <
cutOffA[as.character(matAllC$count), "count"]]
matAllC <- matAll[listCov,]
rm(matAll)
g <- as.matrix(rep(-1, nrow(listPos)))
# The sample is homozygote if the other known allele have a
# coverage of 0
g[listCov][which(matAllC$File1A == 0)] <- 0
g[listCov][which(matAllC$File1R == 0)] <- 2
# The sample is heterozygote if explain the coverage of
# the minor allele by sequencing error is not realistic.
g[listCov][which(matAllC$File1A >=
cutOffA[as.character(matAllC$count), "allele"] &
matAllC$File1R >= cutOffA[as.character(matAllC$count),
"allele"])] <- 1
#g <- as.matrix(g)
if("geno.ref" %in% ls.gdsn(gdsSample)){
var.geno <- index.gdsn(gdsSample, "geno.ref")
compression.gdsn(var.geno, compress="")
append.gdsn(var.geno, g, check=TRUE)
compression.gdsn(var.geno, compress="LZMA_RA.fast")
readmode.gdsn(var.geno)
}else{
var.geno <- add.gdsn(gdsSample, "geno.ref",
valdim=c(length(g), 1),
g, storage="bit2", compress = "LZMA_RA.fast")
readmode.gdsn(var.geno)
}
rm(g)
closefn.gds(gdsfile=gdsSample)
if (verbose) {
message("End ", profileName, " ", Sys.time())
}
## Success
return(0L)
}
|
be6ebeca |
|
a3afa570 |
#' @title Add information related to a specific study and specific samples
#' into a GDS Sample file
#'
#' @description This function add entries related to 1) a specific study and
#' 2) specific samples into a GDS Sample file.
#' The study information is appended to the GDS Sample file "study.list" node
#' when the node is already present in the file. Otherwise, the node is
#' created and then, the information is added.
#' The sample information for all selected samples is appended to the GDS
#' Sample file "study.annot" node
#' when the node is already present in the file. Otherwise, the node is
#' created and then, the information is added.
#'
|
aa3f0a99 |
#' @param gdsProfile an object of class
|
a3afa570 |
#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.
#'
|
3104ba2c |
#' @param pedProfile a \code{data.frame} with the sample information. The
|
a3afa570 |
#' \code{data.frame} must have the columns:
#' "Name.ID", "Case.ID", "Sample.Type", "Diagnosis" and "Source".
#' The unique sample identifier of the \code{data.frame} is the "Name.ID"
#' column and the row names of the \code{data.frame} must be the "Name.ID"
#' values.
#'
#' @param batch a \code{integer} corresponding the batch associated to the
#' study.
#'
#' @param listSamples a \code{vector} of \code{character} string representing
#' the samples (samples identifiers) that are saved into the GDS. All the
#' samples must be present in the 'pdeDF' \code{data.frame}.
|
b6cc5ca4 |
#' If \code{NULL}, all samples present in the \code{dfPedProfile} are used.
|
a3afa570 |
#'
#' @param studyDF a \code{data.frame} with at least the 3 columns: "study.id",
#' "study.desc" and "study.platform". The three columns are in character
#' string format (no factor).
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function.
#'
#' @return a \code{vector} of \code{character} strings representing the sample
#' identifiers that have been saved in the GDS Sample file.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
1434071b |
#' ## Create a temporary GDS file in an current directory
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_11.gds")
|
a3afa570 |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
a3afa570 |
#'
|
5f024f9e |
#' ## Create a PED data frame with sample information
#' ped1KG <- data.frame(Name.ID=c("1KG_sample_01", "1KG_sample_02"),
|
7b9d4130 |
#' Case.ID=c("1KG_sample_01", "1KG_sample_02"),
#' Sample.Type=rep("Reference", 2), Diagnosis=rep("Reference", 2),
#' Source=rep("IGSR", 2), stringsAsFactors=FALSE)
|
a3afa570 |
#'
|
5f024f9e |
#' ## Create a Study data frame with information about the study
#' ## All samples are associated to the same study
#' studyInfo <- data.frame(study.id="Ref.1KG",
|
7b9d4130 |
#' study.desc="Unrelated samples from 1000 Genomes",
#' study.platform="GRCh38 1000 genotypes",
#' stringsAsFactors=FALSE)
|
a3afa570 |
#'
|
5f024f9e |
#' ## Add the sample information to the GDS Sample file
#' ## The information for all samples is added (listSamples=NULL)
#' RAIDS:::addStudyGDSSample(gdsProfile=tmpGDS, pedProfile=ped1KG, batch=1,
|
7b9d4130 |
#' listSamples=NULL, studyDF=studyInfo, verbose=FALSE)
|
a3afa570 |
#'
|
5f024f9e |
#' ## Read study information from GDS Sample file
#' read.gdsn(index.gdsn(node=tmpGDS, path="study.list"))
|
a3afa570 |
#'
|
5f024f9e |
#' ## Read sample information from GDS Sample file
#' read.gdsn(index.gdsn(node=tmpGDS, path="study.annot"))
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
7b9d4130 |
#'
|
a3afa570 |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
|
3104ba2c |
addStudyGDSSample <- function(gdsProfile, pedProfile, batch, listSamples,
|
16faec6d |
studyDF, verbose) {
|
a3afa570 |
## Used only the selected samples (all when listSamples == NULL)
if(!(is.null(listSamples))) {
if(length(listSamples) == length(intersect(listSamples,
|
d135565e |
pedProfile$Name.ID))) {
|
79712a42 |
# if we remove the names we should manage the listSamples order
|
d135565e |
# something like
|
79712a42 |
tmp <- order(as.character(listSamples))
pedProfile <- pedProfile[which(pedProfile$Name.ID %in% listSamples), ]
pedProfile <- pedProfile[order(pedProfile$Name.ID), ][order(tmp),]
|
a3afa570 |
} else {
stop("List of samples includes samples not present in ",
|
3104ba2c |
"the \'pedProfile\' data frame. The sample names must be ",
|
16faec6d |
"present in the \'Name.ID\' column. The row names should",
" be assigned the \'Name.ID\'.")
|
a3afa570 |
}
} else {
|
3104ba2c |
listSamples <- pedProfile$Name.ID
|
a3afa570 |
}
## Create the study data frame that is going to be saved
|
b3a84582 |
df <- data.frame(study.id=studyDF$study.id, study.desc=studyDF$study.desc,
study.platform=studyDF$study.platform, stringsAsFactors=FALSE)
|
a3afa570 |
## Append study information to "study.list" when node already present
## Otherwise, create node and add study information into it
|
aa3f0a99 |
if(! "study.list" %in% ls.gdsn(gdsProfile)) {
|
a3afa570 |
## Create study node and add study information into GDS Sample file
|
aa3f0a99 |
add.gdsn(gdsProfile, "study.list", df)
|
a3afa570 |
|
b3a84582 |
## Create data frame containing sample information and add it to GDS
|
3104ba2c |
study.annot <- data.frame(data.id=pedProfile[, "Name.ID"],
case.id=pedProfile[, "Case.ID"],
sample.type=pedProfile[, "Sample.Type"],
diagnosis=pedProfile[, "Diagnosis"],
source=pedProfile[, "Source"],
study.id=rep(studyDF$study.id, nrow(pedProfile)),
|
16faec6d |
stringsAsFactors=FALSE)
|
aa3f0a99 |
add.gdsn(gdsProfile, "study.annot", study.annot)
|
a3afa570 |
if(verbose) { message("study.annot DONE ", Sys.time()) }
} else{
## Append study information to existing node
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.list/study.id"),
|
a3afa570 |
df$study.id, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.list/study.desc"),
|
a3afa570 |
df$study.desc, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.list/study.platform"),
|
a3afa570 |
df$study.platform, check=TRUE)
## Create data frame containing sample information
|
3104ba2c |
study.annot <- data.frame(data.id=pedProfile[, "Name.ID"],
case.id=pedProfile[, "Case.ID"],
sample.type=pedProfile[, "Sample.Type"],
diagnosis=pedProfile[, "Diagnosis"],
source=pedProfile[, "Source"],
study.id=rep(studyDF$study.id, nrow(pedProfile)),
|
16faec6d |
stringsAsFactors=FALSE)
|
a3afa570 |
## Append sample information to existing node
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/data.id"),
|
a3afa570 |
study.annot$data.id, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/case.id"),
|
a3afa570 |
study.annot$case.id, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/sample.type"),
|
a3afa570 |
study.annot$sample.type, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/diagnosis"),
|
a3afa570 |
study.annot$diagnosis, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/source"),
|
a3afa570 |
study.annot$source, check=TRUE)
|
aa3f0a99 |
append.gdsn(index.gdsn(gdsProfile, "study.annot/study.id"),
|
a3afa570 |
study.annot$study.id, check=TRUE)
if(verbose) { message("study.annot DONE ", Sys.time()) }
}
|
b3a84582 |
## Return the vector of profile IDs that have been added to Profile GDS file
|
3104ba2c |
return(pedProfile[,"Name.ID"])
|
a3afa570 |
}
|
be6ebeca |
#' @title Identity-by-descent (IBD) analysis
#'
#' @description This function calculates the IDB coefficients by KING method
#' of moment using the
#' \code{\link[SNPRelate:snpgdsIBDKING]{SNPRelate::snpgdsIBDKING}}
#' function.
#'
#' @param gds an object of class
#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, an
#' opened SNP GDS file.
#'
#' @param profileID a \code{vector} of \code{character} strings representing
#' the samples to keep for the analysis. If \code{NULL}, all samples are used.
#' Default: \code{NULL}.
#'
#' @param snpID a \code{vector} of \code{character} strings representing
#' the SNPs to keep for the analysis. If \code{NULL}, all SNPs are used.
#' Default: \code{NULL}.
#'
#' @param maf a single \code{numeric} representing the threshold for the minor
#' allele frequency. Only the SNPs with ">= maf" are retained.
#' Default: \code{0.05}.
#'
#' @param verbose a \code{logical} indicating if information is shown during
#' the process in the \code{\link[SNPRelate]{snpgdsIBDKING}}() function.
#'
#' @return a \code{list} containing:
|
38d46f1e |
#' \describe{
|
be6ebeca |
#' \item{sample.id}{a \code{character} string representing the sample
#' ids used in the analysis}
#' \item{snp.id}{a \code{character} string representing the SNP ids
#' used in the analysis}
#' \item{k0}{a \code{numeric}, the IBD coefficient, the probability of
#' sharing zero IBD}
#' \item{k1}{a \code{numeric}, the IBD coefficient, the probability of
#' sharing one IBD}
#' \item{IBS0}{a \code{numeric}, the proportion of SNPs with zero IBS}
#' \item{kinship}{a \code{numeric}, the proportion of SNPs with zero IBS,
#' if the parameter kinship=TRUE}
#' }
#'
#' @examples
#'
#' ## Required
#' library(SNPRelate)
#'
#' ## Open an example dataset (HapMap)
#' genoFile <- snpgdsOpen(snpgdsExampleFileName())
#'
#' ## Extract CEU population
#' samples <- read.gdsn(index.gdsn(genoFile, "sample.id"))
#' CEU <- samples[
#' read.gdsn(index.gdsn(genoFile, "sample.annot/pop.group"))=="CEU"]
#'
#' ## Infer the presence of population stratification
#' ibd.robust <- RAIDS:::runIBDKING(gds=genoFile, profileID=CEU, snpID=NULL,
#' maf=0.05, verbose=FALSE)
#'
#' ## close the genotype file
#' snpgdsClose(genoFile)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom SNPRelate snpgdsIBDKING
#' @encoding UTF-8
#' @keywords internal
runIBDKING <- function(gds, profileID=NULL, snpID=NULL, maf=0.05, verbose) {
## Calculate IBD coefficients by KING method of moment
ibd.robust <- snpgdsIBDKING(gdsobj=gds, sample.id=profileID,
|
d899f5f8 |
snp.id=snpID, maf=maf,
type="KING-robust",
verbose=verbose)
## Return result
|
be6ebeca |
return(ibd.robust)
}
#' @title SNP pruning based on linkage disequilibrium (LD)
#'
#' @description This function is a wrapper for the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function that generates a pruned
#' subset of SNPs that are in approximate linkage equilibrium.
#'
#' @param gds an \code{object} of class
#' \code{\link[SNPRelate]{SNPGDSFileClass}}, a SNP GDS file.
#'
#' @param method a \code{character} string that represents the method that will
#' be used to calculate the linkage disequilibrium in the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. The 4 possible values
|
b3a84582 |
#' are: "corr", "r", "dprime" and "composite".
|
be6ebeca |
#'
#' @param listSamples a \code{vector} of \code{character} strings
#' corresponding to the sample identifiers used in LD pruning done by the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function. If \code{NULL}, all
#' samples are used. Default: \code{NULL}.
#'
#' @param listKeep a \code{vector} of SNVs identifiers specifying selected;
#' if \code{NULL}, all SNVs are used in the
#' \code{\link[SNPRelate]{snpgdsLDpruning}} function. Default: \code{NULL}.
#'
#' @param slideWindowMaxBP a single positive \code{integer} that represents
#' the maximum basepairs (bp) in the sliding window. This parameter is used
#' for the LD pruning done in the \code{\link[SNPRelate]{snpgdsLDpruning}}()
#' function.
#' Default: \code{500000L}.
#'
#' @param thresholdLD a single \code{numeric} value that represents the LD
#' threshold used in the \code{\link[SNPRelate]{snpgdsLDpruning}} function.
#' Default: \code{sqrt(0.1)}.
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1L}.
#'
#' @param verbose a \code{logical} indicating if information is shown during
#' the process in the \code{\link[SNPRelate]{snpgdsLDpruning}}() function.
#'
#' @return a \code{list} of SNP identifiers stratified by chromosomes as
#' generated by \code{\link[SNPRelate]{snpgdsLDpruning}} function.
#'
#' @details
#'
#' The SNP pruning is based on linkage disequilibrium (LD) and is done by the
#' \code{\link[SNPRelate]{snpgdsLDpruning}}() function in the
#' SNPRelate package (https://bioconductor.org/packages/SNPRelate/).
#'
#' @examples
#'
|
1434071b |
#' ## Required
#' library(SNPRelate)
#'
|
be6ebeca |
#' ## Open an example dataset (HapMap)
#' genoFile <- snpgdsOpen(snpgdsExampleFileName())
#'
#' ## Fix seed to get reproducible results
#' set.seed(1000)
#'
#' ## Get linkage Disequilibrium (LD) based SNP pruning
#' snpSet <- RAIDS:::runLDPruning(gds=genoFile, verbose=FALSE)
#' names(snpSet)
#'
#' ## Get SNP ids
#' snp.id <- unlist(unname(snpSet))
#'
#' ## Close the genotype file
#' snpgdsClose(genoFile)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#'
#' @importFrom SNPRelate snpgdsOpen snpgdsLDpruning
#' @importFrom gdsfmt closefn.gds
#' @encoding UTF-8
#' @keywords internal
|
b3a84582 |
runLDPruning <- function(gds, method,
|
be6ebeca |
listSamples=NULL, listKeep=NULL, slideWindowMaxBP=500000L,
thresholdLD=sqrt(0.1), np=1L, verbose) {
## Call SNP LD pruning
|
d899f5f8 |
result <- snpgdsLDpruning(gdsobj=gds, method="corr",
sample.id=listSamples,
snp.id=listKeep, slide.max.bp=slideWindowMaxBP,
ld.threshold=thresholdLD, num.thread=np,
verbose=verbose)
|
be6ebeca |
return(result)
}
|
3978de30 |
#' @title Append fields related to samples into a GDS file
#'
#' @description This function appends the fields related to samples into
#' a GDS file. The information is extracted from the \code{data.frame} passed
#' to the function and is added to the "sample.annot" and "sample.id" nodes.
#' The "sample.id" and "sample.annot" nodes must already exist.
#' If the samples are part of a study, the function
#' addStudyGDSSample() must be used.
#'
|
aa3f0a99 |
#' @param gdsReference an object of class
|
3978de30 |
#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.
#'
|
32e75790 |
#' @param dfPedReference a \code{data.frame} with the information about
#' the sample(s).
|
3978de30 |
#' The \code{data.frame} must have the columns: "sample.id", "Name.ID", "sex",
#' "pop.group" and "superPop". The unique identifier for the sample(s) is
#' the "Name.ID" column and the row names of the \code{data.frame} must
#' correspond to the "Name.ID" column.
#'
#' @param batch a \code{integer} representing the batch identifier.
#'
#' @param listSamples a \code{vector} of \code{character} string with the
#' selected sample(s). If \code{NULL}, all samples are used.
#'
#' @param verbose a \code{logical} indicating if messages should be printed
#' to show how the different steps in the function. Default: \code{TRUE}.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
3978de30 |
#' ## Create a temporary GDS file in an test directory
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_03.gds")
|
3978de30 |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Create "sample.id" node (the node must be present)
#' add.gdsn(node=tmpGDS, name="sample.id", val=c("sample_01",
|
7b9d4130 |
#' "sample_02"))
#'
|
5f024f9e |
#' ## Create "sample.annot" node (the node must be present)
#' add.gdsn(node=tmpGDS, name="sample.annot", val=data.frame(
|
7b9d4130 |
#' Name.ID=c("sample_01", "sample_02"),
#' sex=c(1,1), # 1:Male 2: Female
#' pop.group=c("ACB", "ACB"),
#' superPop=c("AFR", "AFR"),
#' batch=c(1, 1),
#' stringsAsFactors=FALSE))
#'
|
5f024f9e |
#' sync.gds(gdsfile=tmpGDS)
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Create a data.frame with information about samples
#' sample_info <- data.frame(Name.ID=c("sample_04", "sample_05",
|
7b9d4130 |
#' "sample_06"),
#' sex=c(1,2,1), # 1:Male 2: Female
#' pop.group=c("ACB", "ACB", "ACB"),
#' superPop=c("AFR", "AFR", "AFR"),
#' stringsAsFactors=FALSE)
#'
|
5f024f9e |
#' ## The row names must be the sample identifiers
#' rownames(sample_info) <- sample_info$Name.ID
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Add information about 2 samples to the GDS file
#' RAIDS:::appendGDSRefSample(gdsReference=tmpGDS,
|
7b9d4130 |
#' dfPedReference=sample_info,
#' batch=2, listSamples=c("sample_04", "sample_06"), verbose=FALSE)
#'
|
5f024f9e |
#' ## Read sample identifier list
#' ## Only "sample_04" and "sample_06" should have been added
#' read.gdsn(index.gdsn(node=tmpGDS, path="sample.id"))
|
7b9d4130 |
#'
|
5f024f9e |
#' ## Read sample information from GDS file
#' ## Only "sample_04" and "sample_06" should have been added
#' read.gdsn(index.gdsn(node=tmpGDS, path="sample.annot"))
|
3978de30 |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
3978de30 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
3978de30 |
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
|
32e75790 |
appendGDSRefSample <- function(gdsReference, dfPedReference, batch=1,
listSamples=NULL, verbose=TRUE) {
|
3978de30 |
## Only keep selected samples
if(!(is.null(listSamples))){
|
b6cc5ca4 |
dfPedReference <- dfPedReference[listSamples,]
|
3978de30 |
}
## Append sample identifiers to the "sample.id" node
|
aa3f0a99 |
sampleGDS <- index.gdsn(gdsReference, "sample.id")
|
b6cc5ca4 |
append.gdsn(sampleGDS, val=dfPedReference$Name.ID, check=TRUE)
|
3978de30 |
## Create the data.frame with the sample information
|
b6cc5ca4 |
samp.annot <- data.frame(sex = dfPedReference[, "sex"],
pop.group=dfPedReference[, "pop.group"],
superPop=dfPedReference[, "superPop"],
batch=rep(batch, nrow(dfPedReference)),
|
3978de30 |
stringsAsFactors=FALSE)
if(verbose) { message("Annot") }
## Append data.frame to "sample.annot" node
|
aa3f0a99 |
curAnnot <- index.gdsn(gdsReference, "sample.annot/sex")
|
3978de30 |
append.gdsn(curAnnot, samp.annot$sex, check=TRUE)
|
aa3f0a99 |
curAnnot <- index.gdsn(gdsReference, "sample.annot/pop.group")
|
3978de30 |
append.gdsn(curAnnot, samp.annot$pop.group, check=TRUE)
|
aa3f0a99 |
curAnnot <- index.gdsn(gdsReference, "sample.annot/superPop")
|
3978de30 |
append.gdsn(curAnnot, samp.annot$superPop, check=TRUE)
|
aa3f0a99 |
curAnnot <- index.gdsn(gdsReference, "sample.annot/batch")
|
3978de30 |
append.gdsn(curAnnot, samp.annot$batch, check=TRUE)
if(verbose) { message("Annot done") }
return(0L)
}
|
b3a84582 |
#' @title Add the pruned.study entry related to the SNV dataset in the
#' Profile GDS file
#'
#' @description This function adds the names of the SNVs into the node called
#' "pruned.study" in GDS
#' Sample file. If a "pruned.study" entry is already present, the entry is
#' deleted and a new entry is created.
#'
|
32e75790 |
#' @param gdsProfile an object of class \link[gdsfmt]{gds.class} (a GDS file),
#' the opened Profile GDS file.
|
b3a84582 |
#'
#' @param pruned a \code{vector} of \code{character} string representing the
#' name of the SNVs.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
#' ## Create a temporary GDS file in an test directory
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_1.gds")
|
b3a84582 |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' tmpGDS <- createfn.gds(filename=gdsFilePath)
|
b3a84582 |
#'
|
5f024f9e |
#' ## Vector of low allelic fraction
#' study <- c("s19222", 's19588', 's19988', 's20588', 's23598')
|
b3a84582 |
#'
|
5f024f9e |
#' ## Add segments to the GDS file
#' RAIDS:::addGDSStudyPruning(gdsProfile=tmpGDS, pruned=study)
|
b3a84582 |
#'
|
5f024f9e |
#' ## Read lap information from GDS file
#' read.gdsn(index.gdsn(node=tmpGDS, path="pruned.study"))
|
b3a84582 |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=tmpGDS)
|
851a4988 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
851a4988 |
#'
|
b3a84582 |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
|
aa3f0a99 |
addGDSStudyPruning <- function(gdsProfile, pruned) {
|
b3a84582 |
## Delete the pruned.study entry if present in the Profile GDS file
|
aa3f0a99 |
if("pruned.study" %in% ls.gdsn(gdsProfile)) {
delete.gdsn(index.gdsn(node=gdsProfile, "pruned.study"))
|
b3a84582 |
}
## Create the pruned.study node in the Profile GDS file
|
aa3f0a99 |
varPruned <- add.gdsn(node=gdsProfile, name="pruned.study", val=pruned)
|
b3a84582 |
# Write the data cached in memory to the Profile GDS file
|
aa3f0a99 |
sync.gds(gdsfile=gdsProfile)
|
b3a84582 |
return(0L)
}
|
2170193e |
#' @title Add information related to low allelic fraction associated to
#' the SNV dataset for a specific sample into a GDS file
#'
#' @description The function adds the information related to low allelic
#' fraction
#' associated to the SNV dataset for a specific sample into a
#' GDS file, more specifically, in the "lap" node. The "lap" node must
#' already be present in the GDS file.
#'
|
aa3f0a99 |
#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}}
|
2170193e |
#' (a GDS file), a GDS file.
#'
#' @param snpLap a \code{vector} of \code{numeric} value representing the
#' low allelic fraction for each SNV present in the SNV dataset. The
#' values should be between \code{0} and \code{0.50}. The
#' length of the \code{vector} should correspond to the number of SNVs
#' present in the "snp.id" entry of the GDS sample file.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
|
1c304a2a |
#' ## Required library
#' library(gdsfmt)
#'
|
1434071b |
#' ## Create a temporary GDS file
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds")
|
2170193e |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' gdsFile <- createfn.gds(filename=gdsFilePath)
|
2170193e |
#'
|
5f024f9e |
#' ## Create a "lap" node
#' add.gdsn(node=gdsFile, name="lap", val=rep(10L, 12))
#' sync.gds(gdsFile)
|
2170193e |
#'
|
5f024f9e |
#' ## Vector of low allelic fraction
#' lap <- c(0.1, 0.23, 0.34, 0.00, 0.12, 0.11, 0.33, 0.55)
|
2170193e |
#'
|
5f024f9e |
#' ## Add segments to the GDS file
#' RAIDS:::addUpdateLap(gdsProfile=gdsFile, snpLap=lap)
|
2170193e |
#'
|
5f024f9e |
#' ## Read lap information from GDS file
#' read.gdsn(index.gdsn(node=gdsFile, path="lap"))
|
2170193e |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=gdsFile)
|
851a4988 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
851a4988 |
#'
|
2170193e |
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
|
aa3f0a99 |
addUpdateLap <- function(gdsProfile, snpLap) {
|
2170193e |
|
aa3f0a99 |
snpLap2 <- write.gdsn(index.gdsn(gdsProfile, "lap"), snpLap)
|
2170193e |
|
aa3f0a99 |
sync.gds(gdsProfile)
|
2170193e |
return(0L)
}
|
040f95da |
#' @title Extract the block identifiers for a list of SNVs
#'
#' @description The function uses the GDS Reference Annotation file to extract
#' the unique block identifiers for a list of SNVs. The block type that is
#' going to be used to extract the information has to be provided by the
#' user.
#'
#' @param gdsRefAnnot an object of class \code{\link[gdsfmt]{gds.class}}
#' (a GDS file), the opened Reference SNV Annotation GDS file.
#'
#' @param snpIndex a \code{vectcor} of \code{integer} representing the
#' indexes of the SNVs of interest.
#'
#' @param blockTypeID a \code{character} string corresponding to the block
#' type used to extract the block identifiers. The block type must be
#' present in the GDS Reference Annotation file.
#'
#' @return a \code{vector} of \code{numeric} corresponding to the
#' block identifiers for the SNVs of interest.
#'
#' @examples
#'
#' # Required library
#' library(gdsfmt)
#'
#' ## Path to the demo 1KG Annotation GDS file located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
|
3751711a |
#' path1KG <- file.path(dataDir, "tests")
#' fileAnnotGDS <- file.path(path1KG, "ex1_good_small_1KG_Annot.gds")
|
040f95da |
#'
#' gdsRefAnnotation <- openfn.gds(fileAnnotGDS)
#'
#' ## The indexes for the SNVs of interest
#' snpIndex <- c(1,3,5,6,9)
#'
#' ## Extract the block identifiers for the SNVs represented by their indexes
#' ## for the block created using the genes from Hsapiens Ensembl v86
#' RAIDS:::getBlockIDs(gdsRefAnnot=gdsRefAnnotation, snpIndex=snpIndex,
#' blockTypeID="GeneS.Ensembl.Hsapiens.v86")
#'
#' closefn.gds(gdsRefAnnotation)
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @encoding UTF-8
#' @keywords internal
getBlockIDs <- function(gdsRefAnnot, snpIndex, blockTypeID) {
block.annot <- read.gdsn(index.gdsn(gdsRefAnnot, "block.annot"))
pos <- which(block.annot$block.id == blockTypeID)
if(length(pos) != 1) {
stop("The following block type is not found in the ",
|
6fc47f63 |
"GDS Annotation file: \'", blockTypeID, "\'")
|
040f95da |
}
b <- read.gdsn(index.gdsn(gdsRefAnnot, "block"), start=c(1, pos),
|
6fc47f63 |
count = c(-1, 1))[snpIndex]
|
040f95da |
return(b)
}
|
4cf4bc51 |
#' @title Add information related to segments associated to the SNV
#' dataset for a specific sample into a GDS file
#'
#' @description The function adds the information related to segments
#' associated to the SNV dataset for a specific sample into a
#' GDS file, more specifically, in the "segment" node. If the "segment" node
#' already exists, the previous information is erased.
#'
#' @param gdsProfile an object of class \code{\link[gdsfmt]{gds.class}}
#' (a GDS file), a GDS Sample file.
#'
#' @param snpSeg a \code{vector} of \code{integer} representing the segment
#' identifiers associated to each SNV selected for the specific sample. The
#' length of the \code{vector} should correspond to the number of SNVs
#' present in the "snp.id" entry of the GDS sample file.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Temporary GDS file
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP.gds")
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Vector of segment identifiers
#' segments <- c(1L, 1L, 1L, 2L, 2L, 3L, 3L)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Add segments to the GDS file
#' RAIDS:::addUpdateSegment(gdsProfile=GDS_file_tmp, snpSeg=segments)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Read segments information from GDS file
#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="segment"))
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=GDS_file_tmp)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
4cf4bc51 |
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
addUpdateSegment <- function(gdsProfile, snpSeg) {
if("segment" %in% ls.gdsn(gdsProfile)) {
snpLap <- write.gdsn(index.gdsn(gdsProfile, "segment"), snpSeg)
} else{
snpLap <- add.gdsn(gdsProfile, "segment", snpSeg, storage="uint32")
}
sync.gds(gdsProfile)
## Successful
return(0L)
}
#' @title Append sample names into a GDS file
#'
#' @description This function append the sample identifiers into the
#' "samples.id" node of a GDS file.
#'
#' @param gds an object of class
#' \link[gdsfmt]{gds.class} (a GDS file), the opened GDS file.
#'
|
38d46f1e |
#' @param listSamples a \code{vector} of \code{character} string representing
|
4cf4bc51 |
#' the sample identifiers to be added to GDS file.
#'
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library
#' library(gdsfmt)
#'
#' ## Temporary GDS file in current directory
|
5f024f9e |
#' gdsFilePath <- file.path(tempdir(), "GDS_TEMP_04.gds")
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Create and open the GDS file
#' GDS_file_tmp <- createfn.gds(filename=gdsFilePath)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Create "sample.id" node (the node must be present)
#' add.gdsn(node=GDS_file_tmp, name="sample.id", val=c("sample_01",
|
4cf4bc51 |
#' "sample_02"))
#'
|
5f024f9e |
#' sync.gds(gdsfile=GDS_file_tmp)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Add information about 2 samples to the GDS file
#' RAIDS:::appendGDSSampleOnly(gds=GDS_file_tmp,
|
4cf4bc51 |
#' listSamples=c("sample_03", "sample_04"))
#'
|
5f024f9e |
#' ## Read sample identifier list
#' ## Only "sample_03" and "sample_04" should have been added
#' read.gdsn(index.gdsn(node=GDS_file_tmp, path="sample.id"))
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Close GDS file
#' closefn.gds(gdsfile=GDS_file_tmp)
|
4cf4bc51 |
#'
|
5f024f9e |
#' ## Delete the temporary GDS file
#' unlink(x=gdsFilePath, force=TRUE)
|
4cf4bc51 |
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
appendGDSSampleOnly <- function(gds, listSamples) {
sampleGDS <- index.gdsn(gds, "sample.id")
append.gdsn(sampleGDS, val=listSamples, check=TRUE)
return(0L)
}
|
18056f22 |
#' @title Append information associated to ld blocks, as indexes, into the
#' Population Reference SNV Annotation GDS file
#'
#' @description The function appends the information about the ld blocks into
#' the Population Reference SNV Annotation GDS file. The information is
#' extracted from the parameter listBlock.
#'
#' @param gds an object of class \link[gdsfmt]{gds.class}
#' (GDS file), an opened Reference Annotation GDS file.
#'
#' @param listBlock a \code{array} of integer
#' representing the linkage disequilibrium block for
#' each SNV in the in the same order than the variant
#' in Population reference dataset.
#'
#' @param blockName a \code{character} string representing the id of the block.
#' The blockName should not exist in \'gdsRefAnnotFile\'.
#'
#' @param blockDesc a \code{character} string representing the description of
#' the block.
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' ## Required library for GDS
#' library(gdsfmt)
#' ## Path to the demo pedigree file is located in this package
#' dataDir <- system.file("extdata", package="RAIDS")
#'
# ## Temporary file
#' fileAnnotGDS <- file.path(tempdir(), "ex1_good_small_1KG_Ann_GDS.gds")
#'
#'
#' file.copy(file.path(dataDir, "tests",
#' "ex1_NoBlockGene.1KG_Annot_GDS.gds"), fileAnnotGDS)
#'
#'
#' fileReferenceGDS <- file.path(dataDir, "tests",
#' "ex1_good_small_1KG.gds")
#' \donttest{
#' gdsRef <- openfn.gds(fileReferenceGDS)
#' listBlock <- read.gdsn(index.gdsn(gdsRef, "snp.position"))
#' listBlock <- rep(-1, length(listBlock))
#' closefn.gds(gdsRef)
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS, readonly=FALSE)
#' ## Append information associated to blocks
#' RAIDS:::addGDS1KGLDBlock(gds=gdsAnnot1KG,
#' listBlock=listBlock,
#' blockName="blockEmpty",
#' blockDesc="Example")
|
bdbca27f |
#'
#' closefn.gds(gdsAnnot1KG)
|
18056f22 |
#'
#' gdsAnnot1KG <- openfn.gds(fileAnnotGDS)
#' print(gdsAnnot1KG)
#'
#' closefn.gds(gdsAnnot1KG)
#' }
#'
#' ## Remove temporary file
#' unlink(fileAnnotGDS, force=TRUE)
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn
#' @importFrom gdsfmt append.gdsn sync.gds
#' @encoding UTF-8
#' @keywords internal
addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) {
blockAnnot <- data.frame(block.id=blockName,
block.desc=blockDesc,
stringsAsFactors=FALSE)
if(! ("block.annot" %in% ls.gdsn(gds))) {
varBlockAnnot <- add.gdsn(gds, "block.annot", blockAnnot)
}else {
curAnnot <- index.gdsn(gds, "block.annot/block.id")
append.gdsn(curAnnot,blockAnnot$block.id)
curAnnot <- index.gdsn(gds, "block.annot/block.desc")
append.gdsn(curAnnot, blockAnnot$block.desc)
}
varBlock <- NULL
if(! ("block" %in% ls.gdsn(gds))){
varBlock <- add.gdsn(gds, "block",
valdim=c(length(listBlock), 1),
listBlock, storage="int32",
compress = "LZ4_RA")
readmode.gdsn(varBlock)
}else {
if(is.null(varBlock)) {
varBlock <- index.gdsn(gds, "block")
varBlock <- compression.gdsn(varBlock, "")
}
append.gdsn(varBlock, listBlock)
varBlock <- compression.gdsn(varBlock, "LZ4_RA")
}
sync.gds(gds)
return(0L)
}
|