Bioconductor Code: PureCN

Browse code

Support for processMultipleSamples in PureCN.R via --additionaltumors.

Markus Riester authored on 11/07/2020 21:07:06
Showing 8 changed files

DESCRIPTION index 554bb1a..0bd5257 100644
NEWS index ffb4463..5a08af7 100755
R/createNormalDatabase.R index 0791b9d..588c8c5 100644
R/processMultipleSamples.R index 9ddbade..6bcc53d 100644
inst/extdata/Dx.R index 937b8ef..8665764 100644
inst/extdata/PureCN.R index d36c218..2c453fd 100644
tests/testthat/test_processMultipleSamples.R index 935c1a4..c9575d0 100644
vignettes/Quick.Rmd index 278c9ff..2fea815 100755

History View file @ 167ea8a

@@ -2,8 +2,8 @@ Package: PureCN
                      Type: Package
                      Title: Copy number calling and SNV classification using
                          targeted short read sequencing
                     -Version: 1.19.5
                     -Date: 2020-07-06
                     +Version: 1.19.6
                     +Date: 2020-07-11
                      Authors@R: c(person("Markus", "Riester",
                                          role = c("aut", "cre"),
                                          email = "[email protected]",

NEWS

History View file @ 167ea8a

@@ -4,6 +4,8 @@ Changes in version 1.20.0
                      NEW FEATURES
                          o Support for GATK4 GenomicsDB import for mapping bias calculation
                     +    o Added --additionaltumors to PureCN.R to provide coverage files
                     +      from additional biopsies from the same patient when available
                      SIGNIFICANT USER-VISIBLE CHANGES
@@ -20,6 +22,9 @@ SIGNIFICANT USER-VISIBLE CHANGES
                          o Made calculateIntervalWeights defunct
                          o Changed default of min.normals in calculateMappingBiasVcf/Gatk4 to 1
                            from 2
                     +    o Changed default of --signature_databases to
                     +      "signatures.exome.cosmic.v3.may2019" (v3 instead of v2)
                     +    o Now warn if recommended --funsegmentation is not used
                      BUGFIXES

R/createNormalDatabase.R

History View file @ 167ea8a

@@ -303,7 +303,12 @@ calculateTangentNormal <- function(tumor.coverage.file, normalDB,
+                     }
                      .readNormals <- function(normal.coverage.files) {
                     -    normals <- lapply(normal.coverage.files, readCoverageFile)
                     +    normals <- lapply(normal.coverage.files, function(x) {
                     +        if (is(x, "character")) {
                     +            return(readCoverageFile(normalizePath(x)))
                     +        }
                     +        return(x)
                     +    })
                          # check that all files used the same interval file.
                          for (i in seq_along(normals)) {

R/processMultipleSamples.R

History View file @ 167ea8a

@@ -61,7 +61,6 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB,
                          if (!requireNamespace("copynumber", quietly = TRUE)) {
                              .stopUserError("processMultipleSamples requires the copynumber package.")
+                         }
                     -    tumor.coverage.files <- normalizePath(tumor.coverage.files)
                          tumors <- lapply(.readNormals(tumor.coverage.files),
                              calculateTangentNormal, normalDB, num.eigen = num.eigen)
@@ -84,7 +83,10 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB,
                          intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumors[[1]], chr.hash)
                          centromeres <- .getCentromerePositions(centromeres, genome,
                                  if (is.null(tumors[[1]])) NULL else seqlevelsStyle(tumors[[1]]))
+                    -
                     +    if (is.null(centromeres)) {
                     +        .stopUserError("Cannot find centromeres for ", genome,
                     +            ". Provide them manually or select a supported genome.")
                     +    }
                          armLocations <- .getArmLocations(tumors[[1]], chr.hash, centromeres)
                          armLocationsGr <- GRanges(armLocations)
                          arms <- armLocationsGr$arm[findOverlaps(tumors[[1]], armLocationsGr, select="first")]

inst/extdata/Dx.R

History View file @ 167ea8a

@@ -21,7 +21,7 @@ option_list <- list(
                          make_option(c("--signatures"), action = "store_true", default = FALSE,
                              help="Attempt the deconstruction of COSMIC signatures (requires deconstructSigs package)"),
                          make_option(c("--signature_databases"), action = "store", type = "character",
                     -        default = "signatures.cosmic",
                     +        default = "signatures.exome.cosmic.v3.may2019",
                              help = "Use the specified signature databases provided by deconstrucSigs. To test multiple databases, provide them : separated [%default]."),
                          make_option(c("--out"), action = "store", type = "character",
                              default = NULL,

inst/extdata/PureCN.R

History View file @ 167ea8a

@@ -24,6 +24,8 @@ option_list <- list(
                                      default = NULL, help = "Input: Segmentation file"),
                          make_option(c("--logratiofile"), action = "store", type = "character",
                                      default = NULL, help = "Input: Log2 copy number ratio file"),
                     +    make_option(c("--additionaltumors"), action = "store", type = "character",
                     +                default = NULL, help = "Input: tumor coverages from additional biopsies from the SAME patient, GC-normalized"),
                          make_option(c("--sex"), action = "store", type = "character",
                              default = formals(PureCN::runAbsoluteCN)$sex[[2]],
                              help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"),
@@ -181,6 +183,16 @@ if (Sys.getenv("PURECN_DEBUG") != "") {
                          debug <- TRUE
+                     }
                     +.checkFileList <- function(file) {
                     +    files <- read.delim(file, as.is = TRUE, header = FALSE)[,1]
                     +    numExists <- sum(file.exists(files), na.rm = TRUE)
                     +    if (numExists < length(files)) {
                     +        stop("File not exists in file ", file)
                     +    }
                     +    files
                     +}
+                    +
+                    +
                      ### Run PureCN ----------------------------------------------------------------
                      if (file.exists(file.rds) && !opt$force) {
@@ -190,7 +202,6 @@ if (file.exists(file.rds) && !opt$force) {
                          if (is.null(sampleid)) sampleid <- ret$input$sampleid
                      } else {
                          if (!is.null(opt$normaldb)) {
                     -        #if (!is.null(seg.file)) stop("normalDB and segfile do not work together.")
                              normalDB <- readRDS(opt$normaldb)
                              if (!is.null(normal.coverage.file)) {
                                  flog.warn("Both --normal and --normalDB provided. normalDB will NOT be used for coverage denoising. You probably do not want this.")
@@ -213,21 +224,46 @@ if (file.exists(file.rds) && !opt$force) {
                          file.log <- paste0(out, ".log")
                          pdf(paste0(out, "_segmentation.pdf"), width = 10, height = 11)
                     +    if (!is.null(opt$additionaltumors)) {
                     +        if (!is.null(seg.file)) {
                     +            stop("--additionaltumors overwrites --segfile")
                     +        }
                     +        seg.file <- paste0(out, "_multisample.seg")
                     +        if (grepl(".list$", opt$additionaltumors)) {
                     +            additional.tumors <- .checkFileList(opt$additionaltumors)
                     +        } else {
                     +            additional.tumors <- opt$additionaltumors
                     +        }
                     +        multi.seg <- processMultipleSamples(
                     +            c(list(tumor.coverage.file), as.list(additional.tumors)),
                     +            sampleids = c(sampleid, paste(sampleid,
                     +                seq_along(additional.tumors) + 1, sep = "_")),
                     +            normalDB = normalDB, genome = opt$genome, verbose = debug)
                     +        write.table(multi.seg, seg.file, row.names = FALSE, sep = "\t")
                     +    }
                          af.range <- c(opt$minaf, 1 - opt$minaf)
                          test.purity <- seq(opt$minpurity, opt$maxpurity, by = 0.01)
                     +    uses.recommended.fun <- FALSE
                     +    recommended.fun <- if (is.null(seg.file)) "PSCBS" else "Hclust"
                          fun.segmentation <- segmentationCBS
                          if (opt$funsegmentation != "CBS") {
                              if (opt$funsegmentation == "PSCBS") {
                                  fun.segmentation <- segmentationPSCBS
                     +            if (is.null(seg.file)) uses.recommended.fun <- TRUE
                              } else if (opt$funsegmentation == "Hclust") {
                                  fun.segmentation <- segmentationHclust
                     +            if (!is.null(seg.file)) uses.recommended.fun <- TRUE
                              } else if (opt$funsegmentation == "none") {
                                  fun.segmentation <- function(seg, ...) seg
                              } else {
                                  stop("Unknown segmentation function")
+                             }
+                         }
                     +    if (!uses.recommended.fun) {
                     +        flog.warn("Recommended to provide --funsegmentation %s.", recommended.fun)
                     +    }
+                    +
                          mutect.ignore <- eval(formals(PureCN::filterVcfMuTect)$ignore)
                          if (opt$error < formals(PureCN::runAbsoluteCN)$error && !is.null(opt$statsfile)) {
                              flog.info("Low specified error, will keep fstar_tumor_lod flagged variants")
@@ -255,6 +291,7 @@ if (file.exists(file.rds) && !opt$force) {
+                             }
                              log.ratio <- log.ratio$log.ratio
+                         }
+                    +
                          ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file,
                                  tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf,
                                  sampleid = sampleid, plot.cnv = TRUE,

tests/testthat/test_processMultipleSamples.R

History View file @ 167ea8a

@@ -20,6 +20,12 @@ test_that("example output correct", {
                      			 normalDB = normalDB,
                      			 genome = "hg19")
                          expect_equal(c("Sample1", "Sample2"), levels(seg[,1]))
                     +	seg2 <- processMultipleSamples(
                     +             list(tumor.coverage.files[1],readCoverageFile(tumor.coverage.files[2])),
                     +			 sampleids = c("Sample1", "Sample2"),
                     +			 normalDB = normalDB,
                     +			 genome = "hg38", plot.cnv = FALSE)
                     +    expect_equal(c("Sample1", "Sample2"), levels(seg2[,1]))
                          seg.file <- tempfile(fileext = ".seg")
                          write.table(seg, seg.file, row.names = FALSE, sep = "\t")
                          vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN")
@@ -30,4 +36,8 @@ test_that("example output correct", {
                              genome = "hg19", min.ploidy = 1.5, max.ploidy = 2.1,
                              test.purity = seq(0.4, 0.7, by = 0.05), sampleid = "Sample1")
                          expect_equal(0.65, ret$results[[1]]$purity)
                     +	expect_error(processMultipleSamples(tumor.coverage.files,
                     +			 sampleids = c("Sample1", "Sample2"),
                     +			 normalDB = normalDB,
                     +			 genome = "hg20"), "centromere")
                      })

vignettes/Quick.Rmd

History View file @ 167ea8a

@@ -28,11 +28,9 @@ library(BiocStyle)
                      ## Update from previous stable versions
                     -`r Biocpkg("PureCN")` is fully backward compatible with input generated by
                     -versions 1.10, 1.12, 1.14, 1.16 and 1.18. However, 1.16 slightly changed the
                     -mapping bias database and incorporated the interval weights into the database directly.
                     -Simply re-create this RDS file to take advantage of the new features in case
                     -you upgrade from earlier versions:
                     +`r Biocpkg("PureCN")` is backward compatible with input generated by
                     +versions 1.16 and 1.18. For versions 1.8 to 1.14, please re-run `NormalDB.R`
                     +(see also below):
                      ```
                      $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
@@ -40,14 +38,10 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                          --genome hg19 --normal_panel $NORMAL_PANEL --assay agilent_v6
                      ```
                     -`r Biocpkg("PureCN")` 1.10 introduced a completely new normal database format with
                     -several important improvements such as not splitting the database by sample sex
                     -for the normalization of autosomes. It is therefore necessary (not only recommended)
                     -to re-run the `NormalDB.R` when upgrading from version 1.8.
+                    -
                      For upgrades from version 1.6, we highly recommend starting from scratch
                      following this tutorial.
+                    +
                      ## Installation
                      For the command line scripts described in this tutorial, we will need to
@@ -240,18 +234,20 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                          --coveragefiles example_normal.list \
                          --genome hg19 --assay agilent_v6
                     -# When normal panel VCF is available (highly recommended for unmatched samples)
                     +# When normal panel VCF is available (highly recommended for
                     +# unmatched samples)
                      $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                          --coveragefiles example_normal.list \
                     -    --genome hg19 --normal_panel $NORMAL_PANEL \
                     +    --normal_panel $NORMAL_PANEL \
                     +    --genome hg19 \
                          --assay agilent_v6
                      # For a Mutect2/GATK4 normal panel GenomicsDB (experimental)
                      $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                          --coveragefiles example_normal.list \
                     -    --genome hg19 $GENOMICSDB-WORKSPACE-PATH/pon_db \
                     +    --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \
                     +    --genome hg19 \
                          --assay agilent_v6
+                    -
                      ```
                      Important recommendations:
@@ -461,9 +457,9 @@ Important recommendations:
                      ## Recommended _GATK4_ usage
                      ```
                     -# Recommended: Provide a normal panel GenomicsDB to remove mapping biases,
                     -# pre-compute position-specific bias for much faster runtimes with large panels
                     -# This needs to be done only once for each assay.
                     +# Recommended: Provide a normal panel GenomicsDB to remove mapping
                     +# biases, pre-compute position-specific bias for much faster runtimes
                     +# with large panels. This needs to be done only once for each assay.
                      Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                          --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \
                          --assay agilent_v6 --genome hg19 --force
@@ -474,8 +470,7 @@ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID  \
                          --logratiofile $OUT/$SAMPLEID/${SAMPLEID}.denoisedCR.tsv \
                          --segfile $OUT/$SAMPLEID/${SAMPLEID}.modelFinal.seg \
                          --mappingbiasfile $OUT_REF/mapping_bias_agilent_v6_hg19.rds \
                     -    --vcf ${SAMPLEID}_mutect.vcf \
                     -    --statsfile ${SAMPLEID}_mutect_stats.txt \
                     +    --vcf ${SAMPLEID}_mutect2_filtered.vcf \
                          --snpblacklist hg19_simpleRepeats.bed \
                          --genome hg19 \
                          --funsegmentation Hclust \
@@ -495,17 +490,16 @@ and mutational signatures.
                      grep CALLABLE ${SAMPLEID}_callable_status.bed > \
                          ${SAMPLEID}_callable_status_filtered.bed
                     -# Only count mutations in callable regions, also subtract what was ignored
                     -# in PureCN.R via --snpblacklist, like simple repeats, from the mutation per
                     -# megabase calculation
                     +# Only count mutations in callable regions, also subtract what was
                     +# ignored in PureCN.R via --snpblacklist, like simple repeats, from the
                     +# mutation per megabase calculation
                      # Also search for the COSMIC mutation signatures
                      # (http://cancer.sanger.ac.uk/cosmic/signatures)
                      Rscript $PURECN/Dx.R --out $OUT/$SAMPLEID/$SAMPLEID \
                          --rds $OUT/$SAMPLEID/${SAMPLEID}.rds \
                          --callable ${SAMPLEID}_callable_status_filtered.bed \
                          --exclude hg19_simpleRepeats.bed \
                     -    --signatures \
                     -    --signature_databases signatures.exome.cosmic.v3.may2019
                     +    --signatures
                      # Restrict mutation burden calculation to coding sequences
                      Rscript $PURECN/FilterCallableLoci.R --genome hg19 \
@@ -595,6 +589,7 @@ Argument name          | Corresponding PureCN argument | PureCN function
                      `--normaldb`         | `normalDB` (serialized with `saveRDS`) | `calculateTangentNormal`, `filterTargets`
                      `--segfile`          | `seg.file`           | `runAbsoluteCN`
                      `--logratiofile`     | `log.ratio`          | `runAbsoluteCN`
                     +`--additionaltumors` | `tumor.coverage.files` | `processMultipleSamples`
                      `--sex`              | `sex`                | `runAbsoluteCN`
                      `--genome`           | `genome`             | `runAbsoluteCN`
                      `--intervals`        | `interval.file`      | `runAbsoluteCN`