Bioconductor Code: SplicingGraphs

Raw Blame Patch Log History
### =========================================================================
### SplicingGraphs objects
### -------------------------------------------------------------------------


### With the 2-class design we choose below, the SplicingGraphs class does
### NOT extend the CompressedList class, but has the slot ('genes') that
### extends the CompressedList class. This avoids having the SplicingGraphs
### class inherit a very rich API (Vector + List), which would be a problem
### because many operations (like c() or relist()) would be broken, unless
### we re-implement them for SplicingGraphs objects. But: (a) that's a lot of
### work (the API is huge), and (b) we don't need those operations in the
### first place. All we need are: length(), names(), [, [[, elementNROWS(),
### and unlist().
### The GeneModel class is an internal class that is not intended to be
### exposed to the user. It's a list-like class where the elements are
### GRangesList objects, each of them representing a gene. It's currently
### defined exactly how the GRangesListList class would be if we decided to
### have one in the GenomicRanges package.

setClass("GeneModel",
    contains="CompressedList",
    representation(
        unlistData="GRangesList",
        elementMetadata="DataFrame"
    ),
    prototype(
        elementType="GRangesList"
    )
)

setClass("SplicingGraphs",
    representation(
        genes="GeneModel",
        in_by_tx="GRangesList",
        .bubbles_cache="environment"
    )
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel validity.
###

.valid.GeneModel.names <- function(x)
{
    x_names <- names(x)
    if (is.null(x_names)) {
        if (length(x) == 1L)
            return(NULL)
        return("'x' must have names")
    }
    if (anyDuplicated(x_names))
        return("'x' has duplicated names")
    NULL
}

.valid.GeneModel.ex_by_tx <- function(x)
{
    ex_by_tx <- x@unlistData
    if (!is.null(names(ex_by_tx)))
        return("'x@unlistData' must be unnamed")
    ex_mcols <- mcols(ex_by_tx@unlistData)
    valid_exon_mcolnames(colnames(ex_mcols))
}

.valid.GeneModel <- function(x)
{
    c(.valid.GeneModel.names(x),
      .valid.GeneModel.ex_by_tx(x))
}

setValidity2("GeneModel", .valid.GeneModel)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel API.
###

### From the CompressedList API:
### GeneModel objects inherit the CompressedList API i.e. anything that works
### on a CompressedList object works on a GeneModel object. But the only
### things we're using/supporting in the SplicingGraphs package are: length(),
### names(), [, [[, elementNROWS(), and unlist().

### From the GRanges API:
### We only need seqnames(), strand(), and seqinfo() from the GRanges API.

setMethod("seqnames", "GeneModel",
    function(x)
    {
        x_unlisted <- unlist(x)
        unlisted_seqnames <- commonSeqnames.GRangesList(x_unlisted)
        relisted_seqnames <- relist(unlisted_seqnames, x)
        ans <- commonSeqnames.RleList(relisted_seqnames)
        names(ans) <- names(x)
        ans
    }
)

setMethod("strand", "GeneModel",
    function(x)
    {
        x_unlisted <- unlist(x)
        unlisted_strand <- commonStrand.GRangesList(x_unlisted)
        relisted_strand <- relist(unlisted_strand, x)
        ans <- commonStrand.RleList(relisted_strand)
        names(ans) <- names(x)
        ans
    }
)

setMethod("seqinfo", "GeneModel",
    function(x) seqinfo(unlist(x, use.names=FALSE))
)

### The only GeneModel setter supported at the moment.
setReplaceMethod("isCircular", "GeneModel",
    function(x, value)
    {
        isCircular(x@unlistData) <- value
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### GeneModel() constructor.
###
### Used only in this file.
###

### 'ex_by_tx' must be a *named* GRangesList object containing exons grouped
### by transcript. The names must be the gene ids, NOT the transcript ids or
### transcript names.
GeneModel <- function(ex_by_tx)
{
    ex_by_tx_names <- names(ex_by_tx)
    if (is.null(ex_by_tx_names)) {
        ans_partitioning <- PartitioningByEnd(length(ex_by_tx))
    } else {
        names(ex_by_tx) <- NULL
        ans_end <- end(Rle(ex_by_tx_names))
        ans_names <- ex_by_tx_names[ans_end]
        ans_partitioning <- PartitioningByEnd(ans_end, names=ans_names)
    }
    IRanges:::newCompressedList0(getClass("GeneModel"),
                                 ex_by_tx, ans_partitioning)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SplicingGraphs validity.
###
### A SplicingGraphs object 'x' should be validated with:
###
###   validObject(x, complete=TRUE)
###

.valid.SplicingGraphs.in_by_tx <- function(x)
{
    x_in_by_tx <- x@in_by_tx
    x_in_by_tx_names <- names(x_in_by_tx)
    if (is.null(x_in_by_tx_names))
        return("'x@in_by_tx' must have names")
    x_ex_by_tx <- unlist(x@genes)
    if (length(x_in_by_tx) != length(x_ex_by_tx))
        return("'x@in_by_tx' must have the same length as 'unlist(x@genes)'")
    if (!identical(x_in_by_tx_names, names(x_ex_by_tx)))
        return("'x@in_by_tx' must have the same names as 'unlist(x@genes)'")
    if (!identical(elementNROWS(x_in_by_tx) + 1L,
                   elementNROWS(x_ex_by_tx))) {
        msg <- c("the shape of 'x@in_by_tx' is not compatible ",
                 "with the shape of 'unlist(x@genes)'")
        return(paste0(msg, collapse=""))
    }
    NULL
}

.valid.SplicingGraphs <- function(x)
{
    c(.valid.SplicingGraphs.in_by_tx(x))
}

setValidity2("SplicingGraphs", .valid.SplicingGraphs)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Restricted SplicingGraphs API.
###
### Here we only implement:
###   - a few core methods from the List API,
###   - seqnames(), strand(), seqinfo() from the GRanges API.
###

setMethod("length", "SplicingGraphs", function(x) length(x@genes))

setMethod("names", "SplicingGraphs", function(x) names(x@genes))

setMethod("[", "SplicingGraphs",
    function(x, i, j, ..., drop=TRUE)
    {
        if (!missing(j) || length(list(...)) > 0L)
            stop("invalid subsetting")
        if (missing(i))
            return(x)
        if (is.null(names(x)))
            stop("subsetting a SplicingGraphs with no names is not supported")
        x_genes <- x@genes
        seqx <- seq_along(x_genes)
        names(seqx) <- names(x_genes)
        i <- seqx[i]
        ii <- unlist(IRanges(PartitioningByEnd(x_genes))[i], use.names=FALSE)
        x@genes <- x_genes[i]
        x@in_by_tx <- x@in_by_tx[ii]
        x
    }
)

setMethod("[[", "SplicingGraphs",
    function (x, i, j, ...)
    {
        if (!missing(j) || length(list(...)) > 0L)
            stop("invalid subsetting")
        x@genes[[i]]
    }
)

setMethod("elementNROWS", "SplicingGraphs",
    function(x) elementNROWS(x@genes)
)

setMethod("unlist", "SplicingGraphs",
    function(x, recursive=TRUE, use.names=TRUE)
    {
        unlist(x@genes, recursive=recursive, use.names=use.names)
    }
)

setMethod("seqnames", "SplicingGraphs", function(x) seqnames(x@genes))

setMethod("strand", "SplicingGraphs", function(x) strand(x@genes))

setMethod("seqinfo", "SplicingGraphs", function(x) seqinfo(x@genes))

### isCircular<- and txweight<- are the only SplicingGraphs setters supported
### at the moment.
setReplaceMethod("isCircular", "SplicingGraphs",
    function(x, value)
    {
        isCircular(x@genes) <- isCircular(x@in_by_tx) <- value
        x
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### "show" method.
###

setMethod("show", "SplicingGraphs",
    function(object)
    {
        ngene <- length(object)
        ntx <- length(unlist(object, use.names=FALSE))
        cat(class(object), " object with ", ngene, " gene(s) ",
            "and ", ntx, " transcript(s)\n", sep="")
    }
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### SplicingGraphs() constructor
###

### 'exons_start' and 'exons_end' must be 2 integer vectors of the same length
### N representing the start and end positions of N exons that belong to the
### same gene. Returns 2 integer vectors of length N containing the splicing
### site ids assigned to the start and end positions, respectively.
.make_SSids <- function(exons_start, exons_end, on.minus.strand=FALSE)
{
    if (!is.numeric(exons_start))
        stop("'exons_start' must be an integer vector")
    if (!is.integer(exons_start))
        exons_start <- as.integer(exons_start)
    if (!is.numeric(exons_end))
        stop("'exons_end' must be an integer vector")
    if (!is.integer(exons_end))
        exons_end <- as.integer(exons_end)
    N <- length(exons_start)
    if (length(exons_end) != N)
        stop("'exons_start' and 'exons_end' must have the same length")
    ## splicing sites
    ssites <- data.frame(pos=c(exons_start, exons_end),
                         type=rep.int(1:2, c(N, N)))
    ssites_sm <- S4Vectors:::selfmatchIntegerPairs(ssites$pos, ssites$type)
    ## unique splicing sites
    ussites <- ssites[ssites_sm == seq_along(ssites_sm), , drop=FALSE]
    oo <- S4Vectors:::orderIntegerPairs(ussites$pos, ussites$type,
                                        decreasing=on.minus.strand)
    ussites <- ussites[oo, , drop=FALSE]
    SSid <- S4Vectors:::matchIntegerPairs(ssites$pos, ssites$type,
                                          ussites$pos, ussites$type)
    exons_start_SSid <- head(SSid, n=N)
    exons_end_SSid <- tail(SSid, n=N)
    list(start_SSid=exons_start_SSid, end_SSid=exons_end_SSid)
}

### 'gene' must be a GRangesList object containing the exons of a *single*
### gene grouped by transcript. More precisely, each top-level element
### in 'gene' contains the genomic ranges of the exons for a particular
### transcript of the gene.
### Should be able to deal with a GRangesList object of length 0 (i.e. a
### gene with no transcripts).
.setSplicingGraphInfo <- function(gene, check.introns=TRUE)
{
    if (!is(gene, "GRangesList"))
        stop("'gene' must be a GRangesList object")
    if (!isTRUEorFALSE(check.introns))
        stop("'check.introns' must be TRUE or FALSE")
    exons <- gene@unlistData
    common_strand <- commonStrand.GRanges(exons, what="exons in the gene")
    on.minus.strand <- common_strand == "-"
    if (check.introns && length(exons) != 0L) {
        ## We check that, within each transcript, exons are ordered from
        ## 5' to 3' with gaps of at least 1 nucleotide between them.
        ranges_by_tx <- ranges(gene)
        if (on.minus.strand)
            ranges_by_tx <- revElements(ranges_by_tx)
        if (!all(isNormal(ranges_by_tx)))
            stop("some transcripts in the gene don't have their exons ",
                 "ordered from 5' to 3' with gaps of at least 1 ",
                 "nucleotide between them (introns)")
    }

    ## Set splicing site ids.
    SSids <- .make_SSids(start(exons), end(exons),
                         on.minus.strand=on.minus.strand)
    exons_mcols <- mcols(exons)
    if (any(names(SSids) %in% colnames(exons_mcols))) {
        in_1_string <- paste(names(SSids), collapse=", ")
        stop("'unlist(gene)' already has metadata columns: ", in_1_string)
    }
    mcols(exons) <- cbind(mcols(exons), DataFrame(SSids))
    gene@unlistData <- exons
    gene_mcols <- mcols(gene)

    ## Set "tx_id" metadata col.
    if ("tx_id" %in% colnames(gene_mcols))
        stop("'gene' already has metadata column tx_id")
    tx_id <- names(gene)
    if (!is.null(tx_id))
        gene_mcols$tx_id <- tx_id

    ## Set "txpath" metadata col.
    if ("txpath" %in% colnames(gene_mcols))
        stop("'gene' already has metadata column txpath")
    if (on.minus.strand) {
        txpath <- rbind(SSids$end_SSid, SSids$start_SSid)
    } else {
        txpath <- rbind(SSids$start_SSid, SSids$end_SSid)
    }
    txpath_partitioning_end <- end(PartitioningByEnd(gene)) * 2L
    txpath_partitioning <- PartitioningByEnd(txpath_partitioning_end)
    names(txpath_partitioning) <- tx_id
    txpath <- relist(as.vector(txpath), txpath_partitioning)
    gene_mcols$txpath <- txpath

    mcols(gene) <- gene_mcols
    gene
}

### 'x' must be a GRangesList object containing the exons of one or more
### genes grouped by transcript. More precisely, each top-level element
### in 'x' contains the genomic ranges of the exons for a particular
### transcript. Typically 'x' will be obtained from a TxDb object
### 'txdb' with 'exonsBy(txdb, by="tx")'.
### 'grouping' is an optional object that represents the grouping by gene of
### the top-level elements (i.e. transcripts) in 'x'. It can be either:
###   (a) Missing (i.e. NULL). In that case, all the transcripts in 'x'
###       are considered to belong to the same gene and the SplicingGraphs
###       object returned by SplicingGraphs() will be unnamed.
###   (b) A list of integer or character vectors, or an IntegerList, or a
###       CharacterList object, of length the number of genes to process,
###       and where 'grouping[[i]]' is a vector of valid subscripts in 'x'
###       pointing to all the transcripts of the i-th gene.
###   (c) A factor, character vector, or integer vector, of length 'x'
###       with 1 level per gene.
###   (d) A named GRangesList object containing transcripts grouped by gene
###       i.e. each top-level element in 'grouping' contains the genomic ranges
###       of the transcripts for a particular gene. In that case, the grouping
###       is inferred from the tx_id (or alternatively tx_name) metadata
###       column of 'unlist(grouping)' and all the values in that column must
###       be in 'names(x)'.
###       If 'x' was obtained with 'exonsBy(txdb, by="tx")', then the
###       GRangesList object used for grouping would typically be obtained with
###       'transcriptsBy(txdb, by="gene")'.
###   (e) A data.frame or DataFrame with 2 character vector columns: a
###       gene_id column (factor, character vector, or integer vector),
###       and a tx_id (or alternatively tx_name) column. In that case, 'x'
###       must be named and all the values in the tx_id (or tx_name) column
###       must be in 'names(x)'.

.checkOrMakeUniqueGroupingNames <- function(grouping)
{
    grouping_names <- names(grouping)
    if (is.null(grouping_names)) {
        if (length(grouping) != 0L)
            warning("'grouping' is unnamed. Setting names on it (with ",
                    "'names(grouping) <- seq_along(grouping)') as artificial ",
                    "gene ids.")
        names(grouping) <- seq_along(grouping)
        return(grouping)
    }
    if (anyDuplicated(grouping_names))
        stop("'grouping' has duplicated names")
    if (any(c(NA, "") %in% grouping_names))
        stop("'grouping' names contains NAs or \"\" (empty string)")
    grouping
}

### Returns a list or IntegerList or CharacterList. Always *named* with the
### gene ids.
.normargGrouping <- function(grouping, x)
{
    ## (b)
    if (is.list(grouping) || is(grouping, "IntegerList")
     || is(grouping, "CharacterList")) {
        return(.checkOrMakeUniqueGroupingNames(grouping))
    }
    ## (c)
    if (is.factor(grouping) || is.character(grouping) || is.integer(grouping)) {
        if (length(grouping) != length(x))
            stop("when 'grouping' is a factor, character vector, or integer ",
                 "vector, it must have the same length as 'x'")
        return(split(seq_along(x), grouping))
    }
    x_names <- names(x)
    ## (d)
    if (is(grouping, "GRangesList")) {
        if (is.null(x_names))
            stop("when 'grouping' is a GRangesList, 'x' must be named ",
                 "with transcript ids (tx_id) or transcript names (tx_name)")
        grouping <- .checkOrMakeUniqueGroupingNames(grouping)
        mcols <- mcols(grouping@unlistData)
        mcolnames <- colnames(mcols)
        for (colname in c("tx_id", "tx_name")) {
            idx <- which(mcolnames == colname)
            if (length(idx) == 0L)
                next
            if (length(idx) >= 2L)
                stop("'unlist(grouping)' has more than 1 ",
                     colname, " metadata column")
            m <- match(mcols[[idx]], x_names)
            if (any(is.na(m)))
                next
            return(relist(m, PartitioningByEnd(grouping)))
        }
        stop("'unlist(grouping)' has no tx_id or tx_name column, ",
             "or they contain values that are not in 'names(x)'")
    }
    ## (e)
    if (is.data.frame(grouping) || is(grouping, "DataFrame")) {
        if (is.null(x_names))
            stop("when 'grouping' is a data.frame or a DataFrame, 'x' ",
                 "must be named with transcript ids (tx_id) or transcript ",
                 "names (tx_name)")
        grouping_colnames <- colnames(grouping)
        idx <- which(grouping_colnames == "gene_id")
        if (length(idx) != 1L)
            stop("when 'grouping' is a data.frame or a DataFrame, it must ",
                 "have exactly 1 gene_id column")
        gene_id <- grouping[[idx]]
        if (!is.factor(gene_id) && !is.character(gene_id)
         && !is.integer(gene_id))
            stop("'grouping$gene_id' must be a factor, character vector, ",
                 "or integer vector")
        for (colname in c("tx_id", "tx_name")) {
            idx <- which(grouping_colnames == colname)
            if (length(idx) == 0L)
                next
            if (length(idx) >= 2L)
                stop("'grouping' has more than 1 ", colname, " column")
            m <- match(grouping[[idx]], x_names)
            if (any(is.na(m)))
                next
            return(split(m, gene_id))
        }
        stop("'grouping' has no tx_id or tx_name column, ",
             "or they contain values that are not in 'names(x)'")
    }
    stop("invalid 'grouping'")

}

### TODO: Improve handling of invalid genes i.e. provide more details about
### which genes were considered invalid and why.
.make_ex_by_tx_from_GRangesList <- function(x, grouping=NULL,
                                               min.ntx=2, max.ntx=NA,
                                               check.introns=TRUE)
{
    if (!is(x, "GRangesList"))
        stop("'x' must be a GRangesList object")
    if (is.null(grouping)) {
        if (!identical(min.ntx, 2) || !identical(max.ntx, NA))
            stop("the 'min.ntx' and 'max.ntx' args are not supported ",
                 "when 'grouping' is not supplied or NULL")
        ans <- .setSplicingGraphInfo(x, check.introns=check.introns)
        names(ans) <- NULL
        return(ans)
    }

    ## Check 'min.ntx'.
    if (!isSingleNumber(min.ntx))
        stop("'min.ntx' must be a single number")
    if (!is.integer(min.ntx))
        min.ntx <- as.integer(min.ntx)
    if (min.ntx < 1L)
        stop("'min.ntx' must be >= 1")

    ## Check 'max.ntx'.
    if (!isSingleNumberOrNA(max.ntx))
        stop("'max.ntx' must be a single number or NA")
    if (!is.integer(max.ntx))
        max.ntx <- as.integer(max.ntx)
    if (!is.na(max.ntx) && max.ntx < min.ntx)
        stop("'max.ntx' must be >= 'min.ntx'")

    grouping <- .normargGrouping(grouping, x)

    ## Keep genes with nb of transcripts >= min.ntx and <= max.ntx.
    grouping_eltNROWS <- elementNROWS(grouping)
    keep <- grouping_eltNROWS >= min.ntx
    if (!is.na(max.ntx))
        keep <- keep & grouping_eltNROWS <= max.ntx
    grouping <- grouping[keep]
    if (length(grouping) == 0L) {
        ## We need to return an empty GRangesList but it cannot be simply
        ## made with GRangesList() or it wouldn't have the expected metadata
        ## cols (outer and inner). So we call .setSplicingGraphInfo() on an
        ## empty gene instead.
        ans <- .setSplicingGraphInfo(x[NULL], check.introns=check.introns)
        names(ans) <- character(0)
        return(ans)
    }

    ## Main loop.
    ans <- lapply(seq_along(grouping),
                  function(i) {
                      ii <- grouping[[i]]
                      gene <- x[ii]
                      gene2 <- try(.setSplicingGraphInfo(gene,
                                       check.introns=check.introns),
                                   silent=TRUE)
                      if (inherits(gene2, "try-error"))
                          return(NULL)
                      gene2
                  })

    invalid_genes_idx <- which(sapply(ans, is.null))
    nb_invalid_genes <- length(invalid_genes_idx)
    if (nb_invalid_genes != 0L) {
        warning(nb_invalid_genes, " invalid ",
                ifelse(nb_invalid_genes == 1L, "gene was", "genes were"),
                " dropped")
        grouping <- grouping[-invalid_genes_idx]
        ans <- ans[-invalid_genes_idx]
    }
    ans <- do.call(c, ans)
    grouping_names <- names(grouping)
    if (!is.null(grouping_names)) {
        ans_names <- rep.int(grouping_names, elementNROWS(grouping))
        names(ans) <- ans_names
    }
    ans
}

setGeneric("SplicingGraphs", signature="x",
    function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
        standardGeneric("SplicingGraphs")
)

setMethod("SplicingGraphs", "GRangesList",
    function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
    {
        ans_ex_by_tx <- .make_ex_by_tx_from_GRangesList(x, grouping=grouping,
                            min.ntx=min.ntx, max.ntx=max.ntx,
                            check.introns=check.introns)
        ans_genes <- GeneModel(ans_ex_by_tx)
        mcols(ans_ex_by_tx) <- NULL
        ans_in_by_tx <- setdiff(range(ans_ex_by_tx), ans_ex_by_tx)
        common_strand <- commonStrand.GRangesList(ans_in_by_tx)
        ans_in_by_tx <- revElements(ans_in_by_tx, common_strand == "-")
        ans_bubbles_cache <- new.env(parent=emptyenv())
        new("SplicingGraphs", genes=ans_genes,
                              in_by_tx=ans_in_by_tx,
                              .bubbles_cache=ans_bubbles_cache)
    }
)

setMethod("SplicingGraphs", "TxDb",
    function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
    {
        if (!is.null(grouping))
            stop("the 'grouping' arg is not supported ",
                 "when 'x' is a TxDb object")
        ex_by_tx <- exonsBy(x, by="tx", use.names=TRUE)
        tx_by_gn <- transcriptsBy(x, by="gene")
        SplicingGraphs(ex_by_tx, tx_by_gn, min.ntx=min.ntx, max.ntx=max.ntx,
                       check.introns=check.introns)
    }
)

### Not exported. Only used in the .onLoad hook (see zzz.R file) for fixing
### the prototypes of the GeneModel and SplicingGraphs classes.
emptySplicingGraphs <- function()
{
    ex_by_tx <- GRangesList()
    names(ex_by_tx) <- character(0)
    mcols(ex_by_tx@unlistData) <- DataFrame(exon_id=integer(0),
                                            exon_name=character(0),
                                            exon_rank=integer(0))
    ans <- SplicingGraphs(ex_by_tx, IntegerList())
    ## We want to make sure 'ans' is *completely* valid.
    validObject(ans, complete=TRUE)
    ans
}