Bioconductor Code: PureCN

Raw Blame Patch Log History
#' Run PureCN implementation of ABSOLUTE
#' 
#' This function takes as input tumor and normal control coverage data and 
#' a VCF containing allelic fractions of germline variants and somatic 
#' mutations. Normal control does not need to be from the same patient. 
#' In case VCF does not contain somatic status, it should contain dbSNP and 
#' optionally COSMIC annotation. Returns purity and ploidy combinations, 
#' sorted by likelihood score. Provides copy number and LOH data, by both 
#' gene and genomic region.
#' 
#' 
#' @param normal.coverage.file Coverage file of normal control (optional
#' if log.ratio is provided - then it will be only used to filter low coverage
#' exons).  Should be already GC-normalized with
#' \code{\link{correctCoverageBias}}.  Needs to be either a file name or data
#' read with the \code{\link{readCoverageFile}} function.
#' @param tumor.coverage.file Coverage file of tumor. If \code{NULL},
#' requires \code{seg.file} and an interval file via \code{interval.file}.
#' Should be already GC-normalized with \code{\link{correctCoverageBias}}.
#' Needs to be either a file name or data read with the
#' \code{\link{readCoverageFile}} function.
#' @param log.ratio Copy number log-ratios for all exons in the coverage files.
#' If \code{NULL}, calculated based on coverage files.
#' @param seg.file Segmented data. Optional, to support matched SNP6 data or
#' third-pary segmentation tools.  If \code{NULL}, use coverage files or 
#' \code{log.ratio} to segment the data.
#' @param seg.file.sdev If \code{seg.file} provided, the log-ratio standard
#' deviation, used to model likelihood of sub-clonal copy number events.
#' @param vcf.file VCF file.
#' Optional, but typically needed to select between local optima of similar
#' likelihood. Can also be a \code{CollapsedVCF}, read with the \code{readVcf}
#' function.  Requires a DB info flag for dbSNP membership. The default
#' \code{fun.setPriorVcf} function will also look for a Cosmic.CNT slot (see
#' \code{cosmic.vcf.file}), containing the hits in the COSMIC database. Again,
#' do not expect very useful results without a VCF file.
#' @param normalDB Normal database, created with
#' \code{\link{createNormalDatabase}}. If provided, used to calculate gene-level
#' p-values (requires \code{Gene} column in \code{interval.file}) and to filter
#' targets with low coverage in the pool of normal samples.
#' @param genome Genome version, for example hg19. See \code{readVcf}.
#' @param centromeres A \code{GRanges} object with centromere positions.
#' If \code{NULL}, use pre-stored positions for genome versions
#' hg18, hg19 and hg38.
#' @param sex Sex of sample. If \code{?}, detect using
#' \code{\link{getSexFromCoverage}} function and default parameters.  Default
#' parameters might not work well with every assay and might need to be tuned.
#' If set to diploid, then PureCN will assume all chromosomes are diploid and
#' will not try to detect sex.
#' @param fun.filterVcf Function for filtering variants. Expected output is a
#' list with elements \code{vcf} (\code{CollapsedVCF}), flag
#' (\code{logical(1)}) and \code{flag_comment} (\code{character(1)}). The flags
#' will be added to the output data and can be used to warn users, for example
#' when samples look too noisy. Default filter will remove variants flagged by
#' MuTect, but will keep germline variants. If ran in matched normal mode, it
#' will by default use somatic status of variants and filter non-somatic calls
#' with allelic fraction significantly different from 0.5 in normal. Defaults
#' to \code{\link{filterVcfMuTect}}, which in turn also calls
#' \code{\link{filterVcfBasic}}.
#' @param args.filterVcf Arguments for variant filtering function. Arguments
#' \code{vcf}, \code{tumor.id.in.vcf}, \code{min.coverage},
#' \code{model.homozygous} and \code{error} are required in the
#' filter function and are automatically set.
#' @param fun.setPriorVcf Function to set prior for somatic status for each
#' variant in the VCF. Defaults to \code{\link{setPriorVcf}}.
#' @param args.setPriorVcf Arguments for somatic prior function.
#' @param fun.setMappingBiasVcf Function to set mapping bias for each variant
#' in the VCF. Defaults to \code{\link{setMappingBiasVcf}}.
#' @param args.setMappingBiasVcf Arguments for mapping bias function.
#' @param fun.filterIntervals Function for filtering low-quality intervals in the
#' coverage files. Needs to return a \code{logical} vector whether an interval
#' should be used for segmentation. Defaults to \code{\link{filterIntervals}}.
#' @param fun.filterTargets Deprecated.
#' @param args.filterIntervals Arguments for target filtering function. Arguments
#' \code{normal}, \code{tumor}, \code{log.ratio},  
#' \code{min.coverage}\code{seg.file} and \code{normalDB} are required and 
#' automatically set. 
#' @param args.filterTargets Deprecated.
#' @param fun.segmentation Function for segmenting the copy number log-ratios.
#' Expected return value is a \code{data.frame} representation of the
#' segmentation. Defaults to \code{\link{segmentationCBS}}.
#' @param args.segmentation Arguments for segmentation function. Arguments
#' \code{normal}, \code{tumor}, \code{log.ratio}, \code{plot.cnv},
#' \code{sampleid}, \code{vcf}, \code{tumor.id.in.vcf},
#' \code{centromeres} are required in the segmentation function 
#' and automatically set.
#' @param fun.focal Function for identifying focal amplifications. Defaults to
#' \code{\link{findFocal}}.
#' @param args.focal Arguments for focal amplification function.
#' @param sampleid Sample id, provided in output files etc.
#' @param min.ploidy Minimum ploidy to be considered.
#' @param max.ploidy Maximum ploidy to be considered.
#' @param test.num.copy Copy numbers tested in the grid search. Note that focal
#' amplifications can have much higher copy numbers, but they will be labeled
#' as subclonal (because they do not fit the integer copy numbers).
#' @param test.purity Considered tumor purity values.
#' @param prior.purity \code{numeric(length(test.purity))} with priors for
#' tested purity values. If \code{NULL}, use flat priors.
#' @param prior.K This defines the prior probability that the multiplicity of a
#' SNV corresponds to either the maternal or the paternal copy number (for
#' somatic variants additionally to a multiplicity of 1). For perfect
#' segmentations, this value would be 1; values smaller than 1 thus may provide
#' some robustness against segmentation errors.
#' @param prior.contamination The prior probability that a known SNP is from a
#' different individual.
#' @param max.candidate.solutions Number of local optima considered in
#' optimization and variant fitting steps. If there are too many local optima,
#' it will use specified number of top candidate solutions, but will also
#' include all optima close to diploid, because silent genomes have often lots
#' of local optima.
#' @param candidates Candidates to optimize from a previous run
#' (\code{return.object$candidates}).  If \code{NULL}, do 2D grid search and
#' find local optima.
#' @param min.coverage Minimum coverage in both normal and tumor. Intervals and 
#' variants with lower coverage are ignored. This value is provided to the 
#' \code{args.filterIntervals} and \code{args.filterVcf} lists, but can be 
#' overwritten in these lists if different cutoffs for the coverage and variant
#' filters are wanted. To increase the sensitivity of homozygous deletions in 
#' high purity samples, the coverage cutoff in tumor is automatically lowered by 
#' 50 percent if the normal coverage is high.
#' @param max.coverage.vcf This will set the maximum number of reads in the SNV
#' fitting.  This is to avoid that small non-reference biases that come
#' apparent only at high coverages have a dramatic influence on likelihood
#' scores.
#' @param max.non.clonal Maximum genomic fraction assigned to a subclonal copy
#' number state.
#' @param max.homozygous.loss \code{double(2)} with maximum chromosome fraction 
#' assigned to homozygous loss and maximum size of a homozygous loss segment.  
#' @param non.clonal.M Average expected cellular fraction of sub-clonal somatic
#' mutations. This is to calculate expected allelic fractions of a single
#' sub-clonal bin for variants. For all somatic variants, more accurate cellular
#' fractions are calculated.
#' @param max.mapping.bias Exclude variants with high mapping bias from the
#' likelihood score calculation. Note that bias is reported on an inverse
#' scale; a variant with mapping bias of 1 has no bias.
#' @param max.pon Exclude variants found more than \code{max.pon} times in 
#' pool of normals and not in dbSNP. Requires \code{normal.panel.vcf.file} in
#' \code{\link{setMappingBiasVcf}}. Should be set to a value high enough 
#' to be much more likely an artifact and not a true germline variant not
#' present in dbSNP.
#' @param min.variants.segment Flag segments with fewer variants. The
#' minor copy number estimation is not reliable with insufficient variants.
#' @param iterations Maximum number of iterations in the Simulated Annealing
#' copy number fit optimization. Note that this an integer optimization problem
#' that should converge quickly. Allowed range is 10 to 250.
#' @param log.ratio.calibration Re-calibrate log-ratios in the window
#' \code{purity*log.ratio.calibration}.
#' @param smooth.log.ratio Smooth \code{log.ratio} using the \code{DNAcopy}
#' package.
#' @param model.homozygous Homozygous germline SNPs are uninformative and by
#' default removed. In 100 percent pure samples such as cell lines, however,
#' heterozygous germline SNPs appear homozygous in case of LOH. Setting this
#' parameter to \code{TRUE} will keep homozygous SNPs and include a homozygous
#' SNP state in the likelihood model. Not necessary when matched normal samples
#' are available.
#' @param error Estimated sequencing error rate. Used to calculate minimum
#' number of supporting reads for variants using
#' \code{\link{calculatePowerDetectSomatic}}. Also used to calculate the
#' probability of homozygous SNP allelic fractions (assuming reference reads
#' are sequencing errors).
#' @param interval.file A mapping file that assigns GC content and gene symbols
#' to each exon in the coverage files. Used for generating gene-level calls.
#' First column in format CHR:START-END. Second column GC content (0 to 1).
#' Third column gene symbol. This file is generated with the
#' \code{\link{preprocessIntervals}} function.
#' @param max.dropout Measures GC bias as ratio of coverage in AT-rich (GC <
#' 0.5) versus GC-rich on-target regions (GC >= 0.5). High drop-out might 
#' indicate that  data was not GC-normalized or that the sample quality might 
#' be insufficient.
#' Requires \code{interval.file}.
#' @param min.logr.sdev Minimum log-ratio standard deviation used in the 
#' model. Useful to make fitting more robust to outliers in very clean
#' data.
#' @param max.logr.sdev Flag noisy samples with segment log-ratio standard
#' deviation larger than this. Assay specific and needs to be calibrated.
#' @param max.segments Flag noisy samples with a large number of segments.
#' Assay specific and needs to be calibrated.
#' @param min.gof Flag purity/ploidy solutions with poor fit.
#' @param plot.cnv Generate segmentation plots.
#' @param cosmic.vcf.file Add a \code{Cosmic.CNT} info field to the provided
#' \code{vcf.file} using a VCF file containing the COSMIC database. The default
#' \code{fun.setPriorVcf} function will give variants found in the COSMIC database
#' a higher prior probability of being somatic. Not used in likelhood model
#' when matched normal is available in \code{vcf.file}. Should be compressed
#' and indexed with bgzip and tabix, respectively.
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common
#' germline databases. Defaults to \code{DB} that may contain somatic variants
#' if it is from an unfiltered dbSNP VCF.
#' @param POPAF.info.field As alternative to a flag, use an info field that
#' contains population allele frequencies. The \code{DB} info flag has priority
#' over this field when both exist.
#' @param min.pop.af Minimum population allele frequency in 
#' \code{POPAF.info.field} to set a high germline prior probability.
#' @param model Use either a beta or a beta-binomial distribution for fitting
#' observed to expected allelic fractions of alterations in \code{vcf.file}. 
#' The latter can be useful to account for significant overdispersion, for example 
#' due to mapping biases when no pool of normals is available or due to other
#' unmodeled biases, e.g. amplification biases.
#' The amount of expected overdispersion can be controlled via the 
#' \code{max.coverage.vcf} argument (the higher, the less expected bias).
#' @param post.optimize Optimize purity using final SCNA-fit and variants. This
#' might take a long time when lots of variants need to be fitted, but will
#' typically result in a slightly more accurate purity, especially for rather
#' silent genomes or very low purities. Otherwise, it will just use the purity
#' determined via the SCNA-fit.
#' @param speedup.heuristics Tries to avoid spending computation time on
#' local optima that are unlikely correct. Set to 0 to turn this off, to 1 to 
#' only apply heuristics that in worst case will decrease accuracy slightly or 
#' to 2 to turn on all heuristics. 
#' @param BPPARAM \code{BiocParallelParam} object. If \code{NULL}, does not
#' use parallelization for fitting local optima.
#' @param log.file If not \code{NULL}, store verbose output to file.
#' @param verbose Verbose output.
#' @return A list with elements \item{candidates}{Results of the grid search.}
#' \item{results}{All local optima, sorted by final rank.} \item{input}{The
#' input data.}
#' @author Markus Riester
#' @references Riester et al. (2016). PureCN: Copy number calling and SNV 
#' classification using targeted short read sequencing. Source Code for Biology 
#' and Medicine, 11, pp. 13. 
#'
#' Carter et al. (2012), Absolute quantification of somatic DNA alterations in 
#' human cancer. Nature Biotechnology.
#'
#' @seealso \code{\link{correctCoverageBias}} \code{\link{segmentationCBS}}
#' \code{\link{calculatePowerDetectSomatic}}
#' @examples
#' 
#' normal.coverage.file <- system.file('extdata', 'example_normal_tiny.txt', 
#'     package='PureCN')
#' tumor.coverage.file <- system.file('extdata', 'example_tumor_tiny.txt', 
#'     package='PureCN')
#' vcf.file <- system.file('extdata', 'example.vcf.gz', 
#'     package='PureCN')
#' interval.file <- system.file('extdata', 'example_intervals_tiny.txt', 
#'     package='PureCN')
#' 
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example.  This is not a good idea for real
#' # samples.
#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 
#'     tumor.coverage.file=tumor.coverage.file, genome='hg19', vcf.file=vcf.file,
#'     sampleid='Sample1', interval.file=interval.file,
#'     max.ploidy=4, test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#' 
#' 
#' # If a high-quality segmentation was obtained with third-party tools:
#' seg.file <- system.file('extdata', 'example_seg.txt', 
#'     package = 'PureCN')
#' 
#' # By default, PureCN will re-segment the data, for example to identify
#' # regions of copy number neutral LOH. If this is not wanted, we can provide
#' # a minimal segmentation function which just returns the provided one:
#' funSeg <- function(seg, ...) return(seg)
#' 
#' res <- runAbsoluteCN(seg.file=seg.file, fun.segmentation=funSeg, max.ploidy = 4,
#'     test.purity = seq(0.3, 0.7, by = 0.05), max.candidate.solutions=1,
#'     genome='hg19', interval.file=interval.file)
#' 
#' @export runAbsoluteCN
#' @import DNAcopy
#' @import IRanges
#' @import VariantAnnotation
#' @importFrom GenomicRanges GRanges tile
#' @importFrom stats complete.cases dbeta dnorm dunif runif weighted.mean
#'             dbinom C
#' @importFrom utils data read.delim tail packageVersion object.size
#' @importFrom S4Vectors queryHits subjectHits DataFrame expand
#' @importFrom data.table data.table
#' @importFrom futile.logger flog.info flog.warn flog.fatal flog.debug
#'             flog.threshold flog.appender appender.tee
#' @importFrom VGAM dbetabinom.ab
runAbsoluteCN <- function(normal.coverage.file = NULL, 
    tumor.coverage.file = NULL, log.ratio = NULL, seg.file = NULL, 
    seg.file.sdev = 0.4, vcf.file = NULL, normalDB = NULL, genome, 
    centromeres = NULL, sex = c("?", "F", "M", "diploid"), 
    fun.filterVcf = filterVcfMuTect, args.filterVcf = list(), 
    fun.setPriorVcf = setPriorVcf, args.setPriorVcf = list(), 
    fun.setMappingBiasVcf = setMappingBiasVcf, args.setMappingBiasVcf = list(), 
    fun.filterIntervals = filterIntervals, args.filterIntervals = list(),
    fun.filterTargets = NULL, args.filterTargets = NULL,
    fun.segmentation = segmentationCBS, args.segmentation = list(), 
    fun.focal = findFocal, args.focal = list(), 
    sampleid = NULL, min.ploidy = 1, max.ploidy = 6, test.num.copy = 0:7, 
    test.purity = seq(0.15, 0.95, by = 0.01), prior.purity = NULL, 
    prior.K = 0.999, prior.contamination = 0.01, max.candidate.solutions = 20,
    candidates = NULL, min.coverage = 15, max.coverage.vcf = 300, 
    max.non.clonal = 0.2, max.homozygous.loss = c(0.05, 1e07), 
    non.clonal.M = 1/3, max.mapping.bias = 0.8, max.pon = 3, iterations = 30, 
    min.variants.segment = 5, log.ratio.calibration = 0.1, smooth.log.ratio = TRUE, 
    model.homozygous = FALSE, error = 0.001,
    interval.file = NULL, max.dropout = c(0.95, 1.1), 
    min.logr.sdev = 0.15, max.logr.sdev = 0.6, 
    max.segments = 300, min.gof = 0.8, plot.cnv = TRUE, 
    cosmic.vcf.file = NULL, DB.info.flag = "DB", 
    POPAF.info.field = "POP_AF", min.pop.af = 0.001,
    model = c("beta", "betabin"),
    post.optimize = FALSE, speedup.heuristics = 2, BPPARAM = NULL,
    log.file = NULL, verbose = TRUE) {

    if (!verbose) flog.threshold("WARN")
    if (!is.null(log.file)) flog.appender(appender.tee(log.file))
    
    # TODO Remove in 1.14.0
    if (!is.null(fun.filterTargets)) {
        flog.warn("fun.filterTargets was renamed to fun.filterIntervals.")
    }    
    if (!is.null(args.filterTargets)) {
        flog.warn("args.filterTargets was renamed to args.filterIntervals.")
    }
     # log function arguments     
    try(.logHeader(as.list(match.call())[-1]), silent=TRUE)

    model <- match.arg(model)

    # defaults to equal priors for all tested purity values
    if (is.null(prior.purity)) {
        prior.purity <- rep(1, length(test.purity))/length(test.purity)
    }
    # argument checking
    .checkParameters(test.purity, min.ploidy, max.ploidy, max.non.clonal,
        max.homozygous.loss, sampleid, prior.K, prior.contamination, prior.purity,
        iterations, min.gof, model.homozygous, interval.file, log.ratio.calibration,
        test.num.copy)
    
    test.num.copy <- sort(test.num.copy)

    if (!is.null(BPPARAM) && !requireNamespace("BiocParallel", quietly = TRUE)) {
        flog.warn("Install BiocParallel for parallel optimization.")
        BPPARAM <- NULL
    } else if (!is.null(BPPARAM)) {
        flog.info("Using BiocParallel for parallel optimization.")
    }
    
    flog.info("Loading coverage files...")
    
    if (!is.null(normal.coverage.file)) {
        if (is.character(normal.coverage.file)) {
            normal.coverage.file <- normalizePath(normal.coverage.file, mustWork = TRUE)
            normal <- readCoverageFile(normal.coverage.file)
        } else {
            normal <- normal.coverage.file
        }
    }
    if (is.null(tumor.coverage.file)) {
        if ( (is.null(seg.file) && is.null(log.ratio)) || 
            is.null(interval.file)) {
            .stopUserError("Missing tumor.coverage.file requires seg.file or ", 
                           "log.ratio and interval.file.")
        }
        tumor <- .gcGeneToCoverage(interval.file, min.coverage + 1)
        tumor.coverage.file <- tumor
    } else if (is.character(tumor.coverage.file)) {
        tumor.coverage.file <- normalizePath(tumor.coverage.file, mustWork = TRUE)
        tumor <- readCoverageFile(tumor.coverage.file)
        if (is.null(sampleid)) 
            sampleid <- basename(tumor.coverage.file)
    } else {
        tumor <- tumor.coverage.file
        if (is.null(sampleid)) 
            sampleid <- "Sample.1"
    }
    
    chr.hash <- .getChrHash(seqlevels(tumor))
    
    # check that normal is not the same as tumor (only if no log-ratio or
    # segmentation is provided, in that case we wouldn't use normal anyway)
    if (!is.null(normal.coverage.file) & is.null(log.ratio) & is.null(seg.file)) {
        if (identical(tumor$average.coverage, normal$average.coverage)) {
            .stopUserError("Tumor and normal are identical. This won't give any", 
                " meaningful results and I'm stopping here.")
        }
    }
    
    # this ugly if else chain covers the 3 possible ways of segmenting the data: if
    # there is no log-ratio provided, we either need to calculate (case 1) or create
    # fake log-ratios from a provided segmentation file (case 2). Otherwise we just
    # take the provided log-ratio (case 3).
    if (is.null(log.ratio)) {
        if (!is.null(seg.file)) {
            if (is.null(normal.coverage.file)) 
                normal <- tumor
            if (is.null(normal.coverage.file)) {    
                log.ratio <- .createFakeLogRatios(tumor, seg.file, sampleid, 
                                                  chr.hash, model.homozygous, max.logr.sdev)
                smooth.log.ratio <- FALSE
            } else {
                flog.info("seg.file and normal.coverage.file provided. Using both.")
                log.ratio <- calculateLogRatio(normal, tumor)
            }
            if (is.null(sampleid)) 
                sampleid <- read.delim(seg.file, as.is=TRUE)[1, 1]
        } else {
            if (is.null(normal.coverage.file)) {
                .stopUserError("Need a normal coverage file if log.ratio and seg.file are not", 
                  " provided.")
            }
            log.ratio <- calculateLogRatio(normal, tumor)
        }
    } else {
        # the segmentation algorithm will remove targets with low coverage in both tumor
        # and normal, so we just use tumor if there is no normal coverage file.
        if (is.null(normal.coverage.file)) 
            normal <- tumor
        if (length(log.ratio) != length(tumor)) {
            .stopUserError("Length of log.ratio different from tumor ", 
                           "coverage.")
        }
        if (is.null(sampleid)) 
            sampleid <- "Sample.1"
    }
    
    sex <- .getSex(match.arg(sex), normal, tumor)
    tumor <- .fixAllosomeCoverage(sex, tumor)
    
    if (!is.null(interval.file)) {
        tumor <- .addGCData(tumor, interval.file)
    }
    
    args.filterIntervals <- c(list(normal=normal, tumor = tumor, 
        log.ratio = log.ratio, seg.file = seg.file, 
        normalDB = normalDB), args.filterIntervals)
    # not needed anymore
    normalDB <- NULL
    # make it possible to provide different coverages for the different
    # filters 
    if (is.null(args.filterIntervals$min.coverage)) {
        args.filterIntervals$min.coverage <- min.coverage
    }
    
    intervalsUsed <- do.call(fun.filterIntervals, 
        .checkArgs(args.filterIntervals, "filterIntervals"))
    
    # chr.hash is an internal data structure, so we need to do this separately.
    intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumor, chr.hash)
    intervalsUsed <- which(intervalsUsed)
    if (length(tumor) != length(normal) || 
        length(tumor) != length(log.ratio)) {
        .stopRuntimeError("Mis-aligned coverage data.")
    }
    tumor <- tumor[intervalsUsed, ]
    normal <- normal[intervalsUsed, ]
    log.ratio <- log.ratio[intervalsUsed]
    flog.info("Using %i intervals (%i on-target, %i off-target).", length(tumor), 
        sum(tumor$on.target, na.rm=TRUE), sum(!tumor$on.target, na.rm=TRUE))
    
    if (!length(tumor)) {
        .stopUserError("No intervals passing filters.")
    }
    if (!sum(!tumor$on.target, na.rm=TRUE)) {
        flog.info("No off-target intervals. If this is hybrid-capture data,%s",
            " consider adding them.")
    } else {
         flog.info("Ratio of mean on-target vs. off-target read counts: %.2f",  
            mean(tumor$counts[tumor$on.target], na.rm = TRUE) /
            mean(tumor$counts[!tumor$on.target], na.rm = TRUE))
         
         flog.info("Mean off-target bin size: %.0f",
            mean(width(tumor[!tumor$on.target]), na.rm = TRUE))
    }
    if (smooth.log.ratio) {
        CNA.obj <- smooth.CNA(
            .getCNAobject(log.ratio, normal, chr.hash, "sample"))
        log.ratio <- CNA.obj$sample
    }     
    
    dropoutWarning <- FALSE
    # clean up noisy targets, but not if the segmentation was already provided.
    if (is.null(seg.file)) {
        if (!is.null(interval.file)) {
            dropoutWarning <- .checkGCBias(normal, tumor, max.dropout)
        } else {
            flog.info("No interval.file provided. Cannot check if data was GC-normalized. Was it?")
        }
    }
    
    vcf <- NULL
    vcf.germline <- NULL
    tumor.id.in.vcf <- NULL
    normal.id.in.vcf <- NULL
    prior.somatic <- NULL
    mapping.bias <- NULL
    vcf.filtering <- list(flag = FALSE, flag_comment = "")
    sex.vcf <- NULL
    
    if (!is.null(vcf.file)) {
        flog.info("Loading VCF...")
        vcf <- .readAndCheckVcf(vcf.file, genome = genome, 
            DB.info.flag = DB.info.flag, POPAF.info.field = POPAF.info.field,
            min.pop.af = min.pop.af)
        
        if (length(intersect(seqlevels(tumor), seqlevels(vcf))) < 1) {
            .stopUserError("Different chromosome names in coverage and VCF.")
        }
        
        if (is.null(args.filterVcf$use.somatic.status)) {
            args.filterVcf$use.somatic.status <- TRUE
        }
        
        if (sum(colSums(geno(vcf)$DP) > 0) == 1 && args.filterVcf$use.somatic.status) {
            args.filterVcf$use.somatic.status <- FALSE
        }
        
        tumor.id.in.vcf <- .getTumorIdInVcf(vcf, sampleid = sampleid)
        
        if (args.filterVcf$use.somatic.status) {
            normal.id.in.vcf <- .getNormalIdInVcf(vcf, tumor.id.in.vcf)
        }
        
        flog.info("%s is tumor in VCF file.", tumor.id.in.vcf)
        if (sex != "diploid") {
            sex.vcf <- getSexFromVcf(vcf, tumor.id.in.vcf, 
                use.somatic.status=args.filterVcf$use.somatic.status)
            if (!is.na(sex.vcf) && sex %in% c("F", "M") && sex.vcf != sex) {
                flog.warn("Sex mismatch of coverage and VCF. %s%s", 
                        "Could be because of noisy data, contamination, ", 
                  "loss of chrY or a mis-alignment of coverage and VCF.")
            }
        }
        n.vcf.before.filter <- nrow(vcf)
        
        args.filterVcf <- c(list(vcf = vcf, tumor.id.in.vcf = tumor.id.in.vcf, 
            model.homozygous = model.homozygous, error = error, 
            DB.info.flag = DB.info.flag,
            target.granges = tumor[tumor$on.target]), args.filterVcf)
        if (is.null(args.filterVcf$min.coverage)) {
            args.filterVcf$min.coverage <- min.coverage
        }
        vcf.filtering <- do.call(fun.filterVcf, 
            .checkArgs(args.filterVcf, "filterVcf"))
        
        vcf <- vcf.filtering$vcf
        
        if (!is.null(cosmic.vcf.file)) {
            vcf <- .addCosmicCNT(vcf, cosmic.vcf.file)
        }
        
        args.setPriorVcf <- c(list(vcf = vcf, tumor.id.in.vcf = tumor.id.in.vcf, 
            DB.info.flag = DB.info.flag), args.setPriorVcf)
        prior.somatic <- do.call(fun.setPriorVcf, 
            .checkArgs(args.setPriorVcf, "setPriorVcf"))
        
        # get mapping bias
        args.setMappingBiasVcf$vcf <- vcf
        args.setMappingBiasVcf$tumor.id.in.vcf <- tumor.id.in.vcf
        mapping.bias <- do.call(fun.setMappingBiasVcf,
            .checkArgs(args.setMappingBiasVcf, "setMappingBiasVcf"))
        idxHqGermline <- prior.somatic < 0.1 & mapping.bias$bias >= max.mapping.bias
        flog.info("Excluding %i novel or poor quality variants from segmentation.", sum(!idxHqGermline))
        # for larger pool of normals, require that we have seen the SNP 
        if (sum(!is.na(mapping.bias$pon.count)) && 
            max(mapping.bias$pon.count, na.rm = TRUE)> 10) {
            idxHqGermline <- idxHqGermline & mapping.bias$pon.count > 0
            flog.info("Excluding %i variants not in pool of normals from segmentation.",
                 sum(!mapping.bias$pon.count>0))
        }
        vcf.germline <- vcf[idxHqGermline]
    }
    
    flog.info("Sample sex: %s", sex)
    flog.info("Segmenting data...")
    segProvided <- .loadSegFile(seg.file, sampleid, model.homozygous)

    centromeres <- .getCentromerePositions(centromeres, genome, 
        if (is.null(vcf)) NULL else seqlevelsStyle(vcf))
    
    args.segmentation <- c(list(normal = normal, tumor = tumor, log.ratio = log.ratio, 
        seg = segProvided, plot.cnv = plot.cnv,  
        sampleid = sampleid, vcf = vcf.germline, tumor.id.in.vcf = tumor.id.in.vcf, 
        normal.id.in.vcf = normal.id.in.vcf, max.segments = max.segments, chr.hash = chr.hash, 
        centromeres = centromeres), args.segmentation)
    
    vcf.germline <- NULL
    seg <- do.call(fun.segmentation,
            .checkArgs(args.segmentation, "segmentation"))

    if (!is.null(seg.file)) {
        seg <- .fixAllosomeSegmentation(sex, seg)
    }

    if (speedup.heuristics > 1) {
        ds <- .getSizeDomState(seg)
        if (ds$fraction.genome > 0.5 && abs(ds$seg.mean) < 0.1) {
            min.ploidy <- 1.5
            max.ploidy <- 3
            flog.info("Highly dominant copy number state with small log-ratio.%s",
                " Skipping search for high ploidy solutions.")
        }
    }
    seg.gr <- GRanges(seqnames = .add.chr.name(seg$chrom, chr.hash), 
                IRanges(start = round(seg$loc.start), end = seg$loc.end))

    flog.info("Found %i segments with median size of %.2fMb.",
              length(seg.gr), median(width(seg.gr)/1e+6))

    snv.lr <- NULL
    
    if (!is.null(vcf.file)) {
        ov <- findOverlaps(vcf, seg.gr, select = "first")
        
        # Add segment log-ratio to off-target snvs.  For on-target, use observed
        # log-ratio.
        snv.lr <- seg$seg.mean[ov]
        ov.vcfexon <- findOverlaps(vcf, tumor)
        snv.lr[queryHits(ov.vcfexon)] <- log.ratio[subjectHits(ov.vcfexon)]
        if (anyNA(snv.lr)) {
            n.vcf.before.filter <- nrow(vcf)
            vcf <- vcf[!is.na(snv.lr)]
            mapping.bias$bias <- mapping.bias$bias[!is.na(snv.lr)]
            if (!is.null(mapping.bias$pon.count)) {
                mapping.bias$pon.count <- mapping.bias$pon.count[!is.na(snv.lr)]
            }  
            prior.somatic <- prior.somatic[!is.na(snv.lr)]
            
            # make sure all variants are in covered segments
            flog.info("Removing %i variants outside segments.", n.vcf.before.filter - nrow(vcf))
        }
        ov <- findOverlaps(seg.gr, vcf)
        flog.info("Using %i variants.", nrow(vcf))
    }
    
    # get target log-ratios for all segments
    ov.se <- findOverlaps(seg.gr, tumor)
    exon.lrs <- lapply(seq_len(nrow(seg)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == 
        i]])
    exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x)))
    
    # estimate stand. dev. for target logR within targets. this will be used as proxy
    # for sample error.
    targetsPerSegment <- vapply(exon.lrs, length, double(1))
    
    if (!sum(targetsPerSegment > 50, na.rm = TRUE)) {
        .stopRuntimeError("Only tiny segments.")
    }
    
    sd.seg <- max(median(vapply(exon.lrs, sd, double(1)), na.rm = TRUE),
                  min.logr.sdev)
   
    # if user provided seg file, then we do not have access to the log-ratios and
    # need to use the user provided noise estimate also, don't do outlier smoothing
    # when we use already segmented data
    if (!is.null(seg.file) && .isFakeLogRatio(log.ratio) ) {
        sd.seg <- seg.file.sdev
    }
    
    # renormalize, in case segmentation function changed means
    exon.lrs <- .postprocessLogRatios(exon.lrs, seg$seg.mean)

    max.exon.ratio <- 4
    
    # show log-ratio histogram and on/off-target log-ratios
    if (plot.cnv) {
        if (!is.null(segProvided)) {
            par(mfrow = c(2, 1))
            hist(do.call(c, lapply(seq_len(nrow(segProvided)), function(i) rep(segProvided$seg.mean[i], 
                segProvided$num.mark[i]))), breaks = 100, xlab = "log2 ratio", main = paste(sampleid, 
                "(original segmentation)"))
        }
        hist(do.call(c, lapply(seq_len(nrow(seg)), function(i) rep(seg$seg.mean[i], 
            seg$num.mark[i]))), breaks = 100, xlab = "log2 ratio", main = sampleid)
        par(mfrow = c(1, 1))
        .plotLogRatios(log.ratio, tumor$on.target)
    }
    
    # initialize variables
    llik <- -Inf
    li <- .getSegSizes(seg)
    C <- rep(2, length(li))
    
    if (sum(li < 0) > 0) 
        .stopRuntimeError("Some segments have negative size.")
    flog.info("Mean standard deviation of log-ratios: %.2f", sd.seg)
    log.ratio.offset <- rep(0, nrow(seg))
    
    flog.info("2D-grid search of purity and ploidy...")
    
    # find local maxima. use a coarser grid for purity, otherwise we will get far too
    # many solutions, which we will need to cluster later anyways.
    if (!is.null(candidates)) {
        candidate.solutions <- candidates
    } else {
        candidate.solutions <- .optimizeGrid(.get2DPurityGrid(test.purity),
            min.ploidy, max.ploidy, test.num.copy = test.num.copy, 
            exon.lrs, seg, sd.seg, li, max.exon.ratio, max.non.clonal, BPPARAM)
        
        # if we have > 20 somatic mutations, we can try estimating purity based on
        # allelic fractions and assuming diploid genomes.
        if (!is.null(vcf.file) && sum(prior.somatic > 0.5, na.rm = TRUE) > 20) {
            somatic.purity <- min(max(test.purity), .calcPuritySomaticVariants(vcf, 
                prior.somatic, tumor.id.in.vcf))
            
            candidate.solutions$candidates <- .filterDuplicatedCandidates(
                rbind(candidate.solutions$candidates, 
                      c(2, somatic.purity, NA, 2)))
            
        }
    }
    
    if (nrow(candidate.solutions$candidates) > max.candidate.solutions) {
        # test the best solutions and everything close to diploid
        idx.keep <- unique(c(seq_len(max.candidate.solutions), which((candidate.solutions$candidates$tumor.ploidy > 
            1.5 & candidate.solutions$candidates$tumor.ploidy < 2.6))))
        candidate.solutions$candidates <- candidate.solutions$candidates[idx.keep, 
            ]
    }
    flog.info(paste(strwrap(paste("Local optima:\n", paste(round(candidate.solutions$candidates$purity, 
        digits = 2), round(candidate.solutions$candidates$ploidy, digits = 2), 
        sep = "/", collapse = ", "))), collapse = "\n"))
    
    simulated.annealing <- TRUE
    
    .optimizeSolution <- function(cpi) {
        
        max.attempts <- 4
        attempt <- 0
        while (attempt < max.attempts) {
            attempt <- attempt + 1
            total.ploidy <- candidate.solutions$candidates$ploidy[cpi]
            p <- candidate.solutions$candidates$purity[cpi]
            # optimize purity within +/- 0.2 in the first attempt, then just
            # try to match the ploidy
            idxLocal <- which(abs(test.purity-p) < (0.1+attempt/10))
            test.purity.local <- test.purity[idxLocal]
            prior.purity.local <- prior.purity[idxLocal]
             
            flog.info("Testing local optimum %i/%i at purity %.2f and total ploidy %.2f...", 
                cpi, nrow(candidate.solutions$candidates), p, total.ploidy)
            
            subclonal <- rep(FALSE, nrow(seg))
            old.llik <- -1
            cnt.llik.equal <- 0
            C.likelihood <- matrix(ncol = length(test.num.copy) + 1, nrow = nrow(seg))
            colnames(C.likelihood) <- c(test.num.copy, "Subclonal")
            sd.seg.orig <- sd.seg
            for (iter in seq_len(iterations)) {
                flog.debug("Iteration %i, Purity %.2f, total ploidy %.2f, ploidy %.2f, likelihood %.3f, offset %.3f error %.3f",
                    iter, p, total.ploidy,  weighted.mean(C, li), llik, mean(log.ratio.offset), sd.seg)
                # test for convergence
                if (abs(old.llik - llik) < 0.0001) {
                  cnt.llik.equal <- cnt.llik.equal + 1
                }
                
                old.llik <- llik
                if (cnt.llik.equal > 3) 
                  break
                subclonal.f <- length(unlist(exon.lrs[subclonal]))/length(unlist(exon.lrs))
                # should not happen, but sometimes does for very unlikely local optima.
                if (subclonal.f > 0) flog.debug("Subclonal %.2f", subclonal.f)
                if (subclonal.f > max.non.clonal + 0.1) 
                  break
                if (iter == 1) 
                  log.ratio.offset <- .sampleOffsetFast(test.num.copy, seg, exon.lrs, 
                    sd.seg, p, C, total.ploidy, max.exon.ratio, simulated.annealing, 
                    log.ratio.calibration)
                
                # in the first iteration, we do not have integer copy numbers yet (corresponding
                # to local optima purity/ploidy)
                if (iter > 1) {
                  # calculate posterior probabilities of all requested purities
                  total.ploidy <- p * (sum(li * (C)))/sum(li) + (1 - p) * 2  #ploidy
                  px.rij <- lapply(test.purity.local, function(px) vapply(which(!is.na(C)), 
                    function(i) .calcLlikSegment(subclonal = subclonal[i], lr = exon.lrs[[i]] + 
                      log.ratio.offset[i], sd.seg = sd.seg, p = px, Ci = C[i], total.ploidy = total.ploidy, 
                      max.exon.ratio = max.exon.ratio), double(1)))
                  px.rij.s <- vapply(px.rij, sum, na.rm = TRUE, double(1)) + 
                      log(prior.purity.local)
                  
                  if (simulated.annealing) 
                    px.rij.s <- px.rij.s * exp(iter/4)
                  
                  px.rij.s <- exp(px.rij.s - max(px.rij.s))
                  # Gibbs sample purity
                  p <- test.purity.local[min(which(runif(n = 1, min = 0, max = sum(px.rij.s)) <= 
                    cumsum(px.rij.s)))]
                  total.ploidy <- p * (sum(li * (C)))/sum(li) + (1 - p) * 2
                  # Gibbs sample offset
                  if (iter > 2) {
                    log.ratio.offset <- .sampleOffset(subclonal, seg, exon.lrs, sd.seg, 
                      p, C, total.ploidy, max.exon.ratio, simulated.annealing, iter, 
                      log.ratio.calibration)
                    sd.seg <- .sampleError(subclonal, seg, exon.lrs, sd.seg.orig, 
                      p, C, total.ploidy, max.exon.ratio, simulated.annealing, iter, 
                      log.ratio.calibration, log.ratio.offset)
                   }
                }
                
                if (!sum(!is.na(C))) {
                  .stopRuntimeError("Could not assign integer copy numbers", " to segments.")
                }
                
                # calculate the log-liklihood of purity and integer copy numbers plus clonal vs
                # subclonal status
                llik <- sum(vapply(which(!is.na(C)), function(i) .calcLlikSegment(subclonal = subclonal[i], 
                  lr = exon.lrs[[i]] + log.ratio.offset[i], sd.seg = sd.seg, p = p, 
                  Ci = C[i], total.ploidy = total.ploidy, max.exon.ratio = max.exon.ratio), 
                  double(1)))
                
                if (is.na(llik)) {
                  .stopRuntimeError("Could not calculate copy number ", "log-likelihood for purity ", 
                    p, " and total ploidy ", total.ploidy, ".")
                }
                
                for (i in seq_len(nrow(seg))) {
                  # Gibbs sample copy number Step 1: calculate log-likelihoods of fits In the first
                  # iteration, we do not have the integer copy numbers yet, so calculate ploidy
                  # only when we have it next time. Now, use the ploidy from the candidate
                  # solution.
                  if (iter > 1) 
                    total.ploidy <- p * (sum(li * (C)))/sum(li) + (1 - p) * 2
                  
                  p.rij <- vapply(test.num.copy, function(Ci) .calcLlikSegment(subclonal = FALSE, 
                    lr = exon.lrs[[i]] + log.ratio.offset[i], sd.seg = sd.seg, p = p, 
                    Ci = Ci, total.ploidy = total.ploidy, max.exon.ratio = max.exon.ratio), 
                    double(1))
                  
                  # calculate tumor ploidy for all possible copy numbers in this segment
                  ploidy <- vapply(test.num.copy, function(Ci) (sum(li[-i] * (C[-i])) + 
                    li[i] * Ci)/sum(li), double(1))
                  
                  # set probability to zero if ploidy is not within requested range
                  log.prior.ploidy <- log(ifelse(ploidy < min.ploidy | ploidy > max.ploidy, 
                    0, 1))
                  if (iter > 1) 
                    p.rij <- p.rij + log.prior.ploidy

                  liChr <- li
                  idxOtherChrs <- which(seg$chrom != seg$chrom[i])
                  # with small panels, we cannot avoid too long segments and thus
                  # allow chromosomes with lots of deletions in this case
                  if (sum(seg$num.mark[-idxOtherChrs], na.rm = TRUE) > 500) {
                    liChr[idxOtherChrs] <- 0
                  }
                  frac.homozygous.loss <- vapply(test.num.copy, function(Ci) (sum(liChr[-i] * 
                    ifelse(C[-i] < 0.5, 1, 0)) + liChr[i] * ifelse(Ci < 0.5, 1, 0))/sum(liChr), 
                    double(1))
                  if (li[i] > max.homozygous.loss[2] && test.num.copy[1]<1) {
                       frac.homozygous.loss[1] <- 1
                  }

                  log.prior.homozygous.loss <- log(ifelse(frac.homozygous.loss > 
                    max.homozygous.loss[1], 0, 1))
                  if (iter > 1) 
                    p.rij <- p.rij + log.prior.homozygous.loss
                  
                  # model sub clonal state with a uniform distribution
                  p.rij <- c(p.rij, .calcLlikSegmentSubClonal(exon.lrs[[i]] + log.ratio.offset[i], 
                    max.exon.ratio))
                  
                  C.likelihood[i, ] <- exp(p.rij - max(p.rij))
                  
                  if (simulated.annealing) 
                    p.rij <- p.rij * exp(iter/4)
                  
                  # Because we are in log-space, sample relative to most likely fit
                  p.rij <- exp(p.rij - max(p.rij))
                  # Now Gibbs sample best fit
                  z <- runif(n = 1, min = 0, max = sum(p.rij))
                  if (is.na(z)) {
                    message(paste("Iter:", iter, "i:", i, "ploidy:", paste(ploidy, 
                      collapse = "/"), "p.rij:", paste(p.rij)))
                    .stopRuntimeError("Could not fit SCNAs.")
                  }
                  id <- min(which(z <= cumsum(p.rij)))
                  opt.C <- (2^(seg$seg.mean + log.ratio.offset) * total.ploidy)/p - 
                    ((2 * (1 - p))/p)
                  opt.C[opt.C < 0] <- 0
                  # if the sub-clonal event is a homozygous loss, we might reach our limit 
                  if (id > length(test.num.copy) && opt.C[i] <= 0 && is.infinite(min(log.prior.homozygous.loss,na.rm=TRUE))) {
                    C[i] <- 1
                    subclonal[i] <- FALSE             
                  } else if (id > length(test.num.copy)) {
                    # optimal non-integer copy number
                    C[i] <- opt.C[i]
                    subclonal[i] <- TRUE
                  } else if (test.num.copy[id] == max(test.num.copy) && 
                             opt.C[i] > test.num.copy[id] + 1.5) {
                    # rounded ideal integer copy number is max copy number + 2   
                    C[i] <- opt.C[i]
                    subclonal[i] <- TRUE
                  } else {
                    C[i] <- test.num.copy[id]
                    subclonal[i] <- FALSE
                  }
                }
            }
            if (subclonal.f < max.non.clonal && abs(total.ploidy - candidate.solutions$candidates$ploidy[cpi]) < 
                1) 
                break
            log.ratio.calibration <- log.ratio.calibration + 0.25
            if (attempt < max.attempts) {
                flog.info("Recalibrating log-ratios...")
            }
        }
        seg.adjusted <- seg
        seg.adjusted$C <- C
        seg.adjusted$size <- li
        list(log.likelihood = llik, purity = p, ploidy = weighted.mean(C, li), 
             ML.C = C, opt.C = opt.C, ML.Subclonal = subclonal, total.ploidy = total.ploidy,
             seg = seg.adjusted, C.likelihood = C.likelihood, fraction.subclonal = subclonal.f,
             fraction.homozygous.loss = sum(li[which(C < 0.01)])/sum(li),
             log.ratio.offset = log.ratio.offset, 
             log.ratio.sdev = sd.seg, SA.iterations = iter, candidate.id = cpi)
    }

    .fitSolution <- function(sol) {
        SNV.posterior <- NULL
        p <- sol$purity
        if (!is.null(vcf.file)) {
            flog.info("Fitting variants for local optimum %i/%i...", 
                sol$candidate.id, nrow(candidate.solutions$candidates), p)
        }
        if (sol$fraction.subclonal > max.non.clonal) {
            .stopRuntimeError(".fitSolution received high subclonal solution.")
        }
        
        gene.calls <- .getGeneCalls(sol$seg, tumor, log.ratio, fun.focal, 
            args.focal, chr.hash)

        if (!is.null(vcf.file)) {
            if (post.optimize) {
                idx <- c(match(p, test.purity), 
                    (max(1, match(p, test.purity) - 4)):(min(length(test.purity), 
                     match(p, test.purity) + 4)))
                idx <- idx[!duplicated(idx)]
                tp <- test.purity[idx]
                pp <- prior.purity[idx]
            } else {
                tp <- p
                pp <- 1
            }
            cont.rate <- prior.contamination
            .fitSNV <- function(tp, pp) {
                .fitSNVp <- function(px, cont.rate=prior.contamination) {
                    flog.info("Fitting variants for purity %.2f, tumor ploidy %.2f and contamination %.2f.", 
                        px, weighted.mean(sol$ML.C, sol$seg$size), cont.rate)
                  
                  .calcSNVLLik(vcf, tumor.id.in.vcf, ov, px, test.num.copy, 
                    sol$C.likelihood, sol$ML.C, sol$opt.C, median.C = median(rep(sol$ML.C, sol$seg$num.mark)), 
                    snv.model = model, prior.somatic, mapping.bias,
                    snv.lr, sampleid, cont.rate = cont.rate, prior.K = prior.K, 
                    max.coverage.vcf = max.coverage.vcf, non.clonal.M = non.clonal.M, 
                    model.homozygous = model.homozygous, error = error, 
                    max.mapping.bias = max.mapping.bias, max.pon = max.pon,
                    min.variants.segment = min.variants.segment)
                }

                res.snvllik <- lapply(tp[1], .fitSNVp)
                if (post.optimize && length(tp)>1) {
                    GoF <- .getGoF(list(SNV.posterior=res.snvllik[[1]]))
                    idx <- 1
                    if (GoF < (min.gof-0.05) && speedup.heuristics > 0) {
                        flog.info("Poor goodness-of-fit (%.3f). Skipping post-optimization.", GoF)
                    } else if (.calcFractionBalanced(res.snvllik[[1]]$posteriors) > 0.8 && 
                        weighted.mean(sol$ML.C, sol$seg$size) > 2.7 && speedup.heuristics > 0) {
                        flog.info("High ploidy solution in highly balanced genome. Skipping post-optimization.")
                    } else if (.isRareKaryotype(weighted.mean(sol$ML.C, sol$seg$size)) && speedup.heuristics > 0) {
                        flog.info("Rare karyotype solution. Skipping post-optimization.")
                    } else {
                        res.snvllik <- c(res.snvllik, lapply(tp[-1], .fitSNVp))
                      px.rij <- lapply(tp, function(px) vapply(which(!is.na(C)), function(i) .calcLlikSegment(subclonal = sol$ML.Subclonal[i], 
                        lr = exon.lrs[[i]] + sol$log.ratio.offset[i], sd.seg = sol$log.ratio.sdev, p = px, 
                        Ci = sol$ML.C[i], total.ploidy = px * (sum(sol$seg$size * sol$ML.C))/sum(sol$seg$size) + (1 - 
                          px) * 2, max.exon.ratio = max.exon.ratio), double(1)))
                      
                      px.rij.s <- vapply(px.rij, sum, na.rm = TRUE, double(1)) + 
                          log(pp) + vapply(res.snvllik, 
                        function(x) x$llik, double(1))
                      idx <- which.max(px.rij.s)
                  }
                } else {
                  idx <- 1
                }
                p <- tp[idx]
                flog.info("Optimized purity: %.2f", p)
                SNV.posterior <- res.snvllik[[idx]]
                list(p = p, SNV.posterior = SNV.posterior)
            }
            fitRes <- .fitSNV(tp, pp)
            sol$purity <- fitRes$p
            SNV.posterior <- fitRes$SNV.posterior
        }
         
        c(sol, list(
            C.posterior = data.frame(sol$C.likelihood/rowSums(sol$C.likelihood), ML.C =
            sol$ML.C, Opt.C = sol$opt.C, ML.Subclonal = sol$ML.Subclonal),
            SNV.posterior = SNV.posterior,
             fraction.balanced = 
            .calcFractionBalanced(SNV.posterior$posteriors),
            gene.calls = gene.calls,
            failed = FALSE))
    }
    
    # assign integer copy numbers to segments using SA 
    if (is.null(BPPARAM)) {
        results <- lapply(seq_len(nrow(candidate.solutions$candidates)), .optimizeSolution)
    } else {
        results <- BiocParallel::bplapply(seq_len(nrow(candidate.solutions$candidates)), 
                                          .optimizeSolution, BPPARAM = BPPARAM)
    }    
    # rank and then delete lower ranked similar solution
    results <- .rankResults(results)
    nBefore <- length(results)
    results <- .filterDuplicatedResults(results, purity.cutoff = 0.05)
    # bring back in original order for progress output
    results <- results[order(vapply(results, function(sol) sol$candidate.id, integer(1)))]

    if (length(results) < nBefore) { 
        flog.info("Skipping %i solutions that converged to the same optima.",
                  nBefore - length(results))
    }
    idxFailed <- vapply(results, function(sol) 
                        sol$fraction.subclonal > max.non.clonal, logical(1))
    if (sum(is.na(idxFailed))) .stopRuntimeError("NAs in fraction.subclonal.")
    if (sum(idxFailed)) {
        flog.info("Skipping %i solutions exceeding max.non.clonal (%.2f).",
                  sum(idxFailed), max.non.clonal)
    }
    results <- results[which(!idxFailed)]

    # fit SNVs
    if (is.null(BPPARAM) || is.null(vcf.file)) {
        results <- lapply(results, .fitSolution)
    } else {
        results <- BiocParallel::bplapply(results, 
                                          .fitSolution, BPPARAM = BPPARAM)
    }

    results <- .rankResults(results)
    results <- .filterDuplicatedResults(results)

    # If necessary, try to estimate contamination
    #--------------------------------------------------------------------------
    cont.rate <- prior.contamination
    
    for (i in seq_along(results)) {
        if (grepl("CONTAMINATION", vcf.filtering$flag_comment)) {
            cont.rate <- .estimateContamination(
                results[[i]]$SNV.posterior$posteriors, 
                max.mapping.bias)
            if (cont.rate > prior.contamination) {
                flog.info("Initial guess of contamination rate: %.3f", cont.rate)
            }    
        }
        ## optimize contamination. we just re-run the fitting if necessary (
        if (grepl("CONTAMINATION", vcf.filtering$flag_comment)) {
            if (cont.rate > prior.contamination) {
                flog.info("Optimizing contamination rate of optimum %i/%i...", 
                    i, length(results))
                     
                res.snvllik <-
                    .calcSNVLLik(vcf, tumor.id.in.vcf,
                          ov, results[[i]]$purity, test.num.copy, results[[i]]$C.likelihood, 
                          results[[i]]$C.posterior$ML.C,
                          results[[i]]$C.posterior$Opt.C,
                          median.C = median(rep(results[[i]]$seg$C, results[[i]]$seg$num.mark)),
                          snv.model = model, prior.somatic, mapping.bias,
                          snv.lr, sampleid, cont.rate = cont.rate, prior.K = prior.K,
                          max.coverage.vcf = max.coverage.vcf, non.clonal.M = non.clonal.M,
                          model.homozygous = model.homozygous, error = error,
                          max.mapping.bias = max.mapping.bias, max.pon = max.pon,
                          min.variants.segment = min.variants.segment)
                results[[i]]$SNV.posterior <- res.snvllik
            }    
            cont.rate <- .estimateContamination(
                        results[[i]]$SNV.posterior$posteriors,
                                    max.mapping.bias)
                    flog.info("Optimized contamination rate: %.3f", cont.rate)
            results[[i]]$SNV.posterior$posterior.contamination <- cont.rate
        }
    }
    # re-order after contamination optimization
    if (length(results)>1) results <- .rankResults(results)

    if (length(results) && 
        !is.null(results[[1]]$SNV.posterior$posterior.contamination) &&
        results[[1]]$SNV.posterior$posterior.contamination < 0.001 &&
        !is.na(vcf.filtering$flag_comment) &&
        vcf.filtering$flag_comment == "POTENTIAL SAMPLE CONTAMINATION") {
        vcf.filtering$flag <- FALSE
        vcf.filtering$flag_comment <- ""   
    }    

    results <- .flagResults(results, max.non.clonal = max.non.clonal, max.logr.sdev = max.logr.sdev, 
        logr.sdev = sd.seg, max.segments = max.segments, min.gof = min.gof, flag = vcf.filtering$flag, 
        flag_comment = vcf.filtering$flag_comment, dropout = dropoutWarning, 
        use.somatic.status = args.filterVcf$use.somatic.status, 
        model.homozygous = model.homozygous)
    
    if (length(results) < 1) {
        flog.warn("Could not find valid purity and ploidy solution.")
    }
    .logFooter()
    list(
        candidates = candidate.solutions, 
        results = results, 
        input = list(tumor = tumor.coverage.file, normal = normal.coverage.file, 
            log.ratio = GRanges(normal[, 1], on.target=normal$on.target, log.ratio = log.ratio), 
            log.ratio.sdev = sd.seg, vcf = vcf, sampleid = sampleid, 
            test.num.copy = test.num.copy,
            sex = sex, sex.vcf = sex.vcf, chr.hash = chr.hash, centromeres = centromeres,
            args=list(
                filterVcf = args.filterVcf[vapply(args.filterVcf, object.size, double(1)) < 1000],
                filterIntervals = args.filterIntervals[vapply(args.filterIntervals, object.size, double(1)) < 1000])
        )
    )
}