#' Create database of normal samples
#' 
#' Function to create a database of normal samples, used to find a good match
#' for tumor copy number normalization. Internally, this function determines
#' the sex of the samples and trains a PCA that is later used for clustering a
#' tumor file with all normal samples in the database.
#' 
#' 
#' @param normal.coverage.files Vector with file names pointing to GATK
#' coverage files of normal samples.
#' @param sex \code{character(length(normal.coverage.files))} with sex for all
#' files.  \code{F} for female, \code{M} for male. If all chromosomes are
#' diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.
#' @param max.mean.coverage Assume that coverages above this value do not
#' necessarily improve copy number normalization. Internally, samples with
#' coverage higher than this value will be normalized to have mean coverage
#' equal to this value.
#' @param \dots Arguments passed to the \code{prcomp} function.
#' @return A normal database that can be used in the
#' \code{\link{findBestNormal}} function to retrieve good matching normal
#' samples for a given tumor sample.
#' @author Markus Riester
#' @seealso \code{\link{findBestNormal}}
#' @examples
#' 
#' normal.coverage.file <- system.file("extdata", "example_normal.txt", 
#'     package="PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt", 
#'     package="PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#' 
#' @export createNormalDatabase
#' @importFrom stats prcomp
createNormalDatabase <- function(normal.coverage.files, sex = NULL,
max.mean.coverage = 100, ... ) {
    normal.coverage.files <- normalizePath(normal.coverage.files)
    normals <- .readNormals(normal.coverage.files)

    normals.m <- do.call(cbind, 
        lapply(normals, function(x) x$average.coverage))
    idx <- complete.cases(normals.m)

    z <- apply(normals.m[idx,],2,mean)
    z <- sapply(max.mean.coverage/z, min,1)
    normals.m <- scale(normals.m, 1/z, center=FALSE)

    normals.pca <- prcomp(t(normals.m[idx,]), ...)
    sex.determined <- sapply(normals,getSexFromCoverage)
    if (is.null(sex)) {
        sex <- sex.determined
    } else {   
        if (length(sex) != length(normals)) {
            .stopUserError("Length of normal.coverage.files and sex different")
        } 
        idx.sex <- sex %in% c(NA, "F", "M", "diploid")
        sex[!idx.sex] <- NA
        if(sum(!idx.sex)>0) warning("Unexpected values in sex ignored.")   
        for (i in seq_along(sex.determined)) {
            if (!is.na(sex.determined[i]) && sex[i] != "diploid" &&
                sex.determined[i] != sex[i]) {
                flog.warn("Sex mismatch in %s. Sex provided is %s, but could be %s.", 
                    normal.coverage.files[i], sex[i], sex.determined[i])
            }    
        }    
    }
    list(
        normal.coverage.files=normal.coverage.files, 
        pca=normals.pca, 
        exons.used=idx, 
        coverage=apply(normals.m, 2, mean, na.rm=TRUE), 
        exon.median.coverage=apply(normals.m,1,median, na.rm=TRUE),
        exon.log2.sd.coverage=apply(log2(normals.m+1),1,sd, na.rm=TRUE),
        fraction.missing=apply(normals.m,1,function(x) 
            sum(is.na(x)|x<0.01))/ncol(normals.m),
        sex=sex
    )
}


#' Calculate exon weights
#' 
#' This function is defunct. Please use \code{\link{createTargetWeights}}
#' instead.
#' 
#' 
#' @author Markus Riester
#' @export createExonWeightFile
createExonWeightFile <- function() {
    .Defunct("createTargetWeights")
}



#' Calculate target weights
#' 
#' Creates a target weight file useful for segmentation. Requires a set of GATK
#' coverage files from normal samples. A small number of tumor (or other
#' normal) samples is then tested against all normals. Target weights will be
#' set proportional to the inverse of coverage standard deviation across all
#' normals. Targets with high variance in coverage in the pool of normals are
#' thus down-weighted.
#' 
#' 
#' @param tumor.coverage.files A small number (1-3) of GATK tumor or normal
#' coverage samples.
#' @param normal.coverage.files A large number of GATK normal coverage samples
#' (>20) to estimate target log-ratio standard deviations. Should not overlap
#' with files in \code{tumor.coverage.files}.
#' @param target.weight.file Output filename.
#' @return A \code{data.frame} with target weights.
#' @author Markus Riester
#' @examples
#' 
#' target.weight.file <- "target_weights.txt"
#' normal.coverage.file <- system.file("extdata", "example_normal.txt", 
#'     package="PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt", 
#'     package="PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt", 
#'     package="PureCN")
#' 
#' createTargetWeights(tumor.coverage.file, normal.coverage.files, target.weight.file)
#' 
#' @export createTargetWeights
createTargetWeights <- function(tumor.coverage.files, normal.coverage.files,
target.weight.file) {
    flog.info("Loading coverage data...")
    tumor.coverage <- lapply(tumor.coverage.files,  readCoverageGatk)
    normal.coverage <- lapply(normal.coverage.files,  readCoverageGatk)
    lrs <- lapply(tumor.coverage, function(tc) sapply(normal.coverage, 
            function(nc) calculateLogRatio(nc, tc)))

    lrs <- do.call(cbind, lrs)

    lrs[is.infinite(lrs)] <- NA

    lrs.sd <- apply(lrs, 1, sd, na.rm=TRUE)
    lrs.cnt.na <- apply(lrs,1, function(x) sum(is.na(x)))

    zz <- quantile(lrs.sd,probs=0.75, na.rm=TRUE)/lrs.sd
    zz[zz>1] <-1
    idx <- is.na(zz) | lrs.cnt.na > ncol(lrs)/3
    zz[idx] <- min(zz, na.rm=TRUE)
    ret <- data.frame(Target=tumor.coverage[[1]][,1], Weights=zz)

    write.table(ret, file=target.weight.file,row.names=FALSE, quote=FALSE, 
        sep="\t")
    invisible(ret)
}

.readNormals <- function(normal.coverage.files) {
    normals <- lapply(normal.coverage.files, readCoverageGatk)

    # check that all files used the same interval file.
    for (i in seq_along(normals)) {
        if (!identical(normals[[i]]$probe, normals[[1]]$probe)) {
            .stopUserError("All normal.coverage.files must have the same ",
                "intervals. ", normal.coverage.files[i], " is different.")
        }
    }
    normals
}