Browse code

Change SplicingGraphs class definition and API so SplicingGraphs objects are now list-like objects with 1 gene per list element.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SplicingGraphs@74180 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 11/03/2013 21:18:32
Showing 13 changed files

... ...
@@ -1,6 +1,6 @@
1 1
 Package: SplicingGraphs
2 2
 Title: Tools for creating splicing graphs from annotations and RNA-Seq data
3
-Version: 0.5.0
3
+Version: 0.5.1
4 4
 Author: D. Bindreither, M. Carlson, M. Morgan, H. Pages
5 5
 License: Artistic-2.0
6 6
 Description: This package provides tools for creating splicing graphs based on
... ...
@@ -25,12 +25,7 @@ exportClasses(SplicingGraphs)
25 25
 ###
26 26
 
27 27
 exportMethods(
28
-    length,
29
-    names,
30
-    elementLengths,
31
-    plot,
32
-    findOverlaps,
33
-    encodeOverlaps
28
+    plot
34 29
 )
35 30
 
36 31
 
... ...
@@ -3,26 +3,49 @@
3 3
 ### -------------------------------------------------------------------------
4 4
 
5 5
 
6
-### We deliberately choose to not extend GRangesList to make SplicingGraphs
7
-### objects read-only and with a very restricted API (opaque objects).
8 6
 setClass("SplicingGraphs",
7
+    contains="CompressedList",
9 8
     representation(
10
-        tx="GRangesList"
9
+        unlistData="GRangesList",
10
+        elementMetadata="DataFrame"
11
+    ),
12
+    prototype(
13
+        elementType="GRangesList"
11 14
     )
12 15
 )
13 16
 
14 17
 
15 18
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
16
-### Basic accessors.
17
-###
18
-### We support only a very small subset of getters from the GRangesList API.
19
+### Validity.
19 20
 ###
20 21
 
21
-setMethod("length", "SplicingGraphs", function(x) length(x@tx))
22
+.valid.SplicingGraphs.names <- function(x)
23
+{
24
+    x_names <- names(x)
25
+    if (is.null(x_names)) {
26
+        if (length(x) == 1L)
27
+            return(NULL)
28
+        return("'x' must have names")
29
+    }
30
+    if (anyDuplicated(x_names))
31
+        return("'x' has duplicated names")
32
+    NULL
33
+}
34
+
35
+.valid.SplicingGraphs.unlistData <- function(x)
36
+{
37
+    x_unlistData <- x@unlistData
38
+    if (!is.null(x_unlistData))
39
+        return("'x@unlistData' must be unnamed")
40
+    NULL
41
+}
22 42
 
23
-setMethod("names", "SplicingGraphs", function(x) names(x@tx))
43
+.valid.SplicingGraphs <- function(x)
44
+{
45
+    c(.valid.SplicingGraphs.names(x), .valid.SplicingGraphs.unlistData(x))
46
+}
24 47
 
25
-setMethod("elementLengths", "SplicingGraphs", function(x) elementLengths(x@tx))
48
+setValidity2("SplicingGraphs", .valid.SplicingGraphs)
26 49
 
27 50
 
28 51
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
... ...
@@ -32,14 +55,9 @@ setMethod("elementLengths", "SplicingGraphs", function(x) elementLengths(x@tx))
32 55
 setMethod("show", "SplicingGraphs",
33 56
     function(object)
34 57
     {
35
-        ntx <- length(object)
36
-        object_names <- names(object)
37
-        if (is.null(object_names)) {
38
-            ngenes <- ifelse(ntx == 0L, 0L, 1L)
39
-        } else {
40
-            ngenes <- length(unique(object_names))
41
-        }
42
-        cat(class(object), " object with ", ngenes, " gene(s) ",
58
+        ngene <- length(object)
59
+        ntx <- length(unlist(object, use.names=FALSE))
60
+        cat(class(object), " object with ", ngene, " gene(s) ",
43 61
             "and ", ntx, " transcript(s)\n", sep="")
44 62
     }
45 63
 )
... ...
@@ -270,9 +288,9 @@ setMethod("show", "SplicingGraphs",
270 288
 
271 289
 ### TODO: Improve handling of invalid genes i.e. provide more details about
272 290
 ### which genes were considered invalid and why.
273
-.make_SplicingGraphs_from_GRangesList <- function(x, grouping=NULL,
274
-                                                  min.ntx=2, max.ntx=NA,
275
-                                                  check.introns=TRUE)
291
+.make_unlisted_SplicingGraphs_from_GRangesList <- function(x, grouping=NULL,
292
+                                                      min.ntx=2, max.ntx=NA,
293
+                                                      check.introns=TRUE)
276 294
 {
277 295
     if (!is(x, "GRangesList"))
278 296
         stop("'x' must be a GRangesList object")
... ...
@@ -349,10 +367,20 @@ setGeneric("SplicingGraphs", signature="x",
349 367
 setMethod("SplicingGraphs", "GRangesList",
350 368
     function(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
351 369
     {
352
-        ans_tx <- .make_SplicingGraphs_from_GRangesList(x,
353
-                      grouping=grouping, min.ntx=min.ntx, max.ntx=max.ntx,
354
-                      check.introns=check.introns)
355
-        new("SplicingGraphs", tx=ans_tx)
370
+        ans_unlistData <- .make_unlisted_SplicingGraphs_from_GRangesList(x,
371
+                            grouping=grouping, min.ntx=min.ntx, max.ntx=max.ntx,
372
+                            check.introns=check.introns)
373
+        ans_unlistData_names <- names(ans_unlistData)
374
+        if (is.null(ans_unlistData_names)) {
375
+            ans_partitioning <- PartitioningByEnd(length(ans_unlistData))
376
+        } else {
377
+            names(ans_unlistData) <- NULL
378
+            ans_end <- end(Rle(ans_unlistData_names))
379
+            ans_names <- ans_unlistData_names[ans_end]
380
+            ans_partitioning <- PartitioningByEnd(ans_end, names=ans_names)
381
+        }
382
+        IRanges:::newCompressedList0("SplicingGraphs",
383
+                                     ans_unlistData, ans_partitioning)
356 384
     }
357 385
 )
358 386
 
... ...
@@ -3,27 +3,6 @@
3 3
 ### compatible hits per exon or per intron
4 4
 ### -------------------------------------------------------------------------
5 5
 
6
-setMethod("findOverlaps", c("GRangesList", "SplicingGraphs"),
7
-    function(query, subject, maxgap=0L, minoverlap=1L,
8
-             type=c("any", "start", "end", "within", "equal"),
9
-             select=c("all", "first", "last", "arbitrary"),
10
-             ignore.strand=ignore.strand)
11
-    {
12
-        findOverlaps(query, subject@tx,
13
-                     maxgap=maxgap, minoverlap=minoverlap,
14
-                     type=match.arg(type), select=match.arg(select),
15
-                     ignore.strand=ignore.strand)
16
-    }
17
-)
18
-
19
-setMethod("encodeOverlaps", c("GRangesList", "SplicingGraphs"),
20
-    function(query, subject, hits=NULL, flip.query.if.wrong.strand=FALSE)
21
-    {
22
-        encodeOverlaps(query, subject@tx,
23
-                       hits=hits,
24
-                       flip.query.if.wrong.strand=flip.query.if.wrong.strand)
25
-    }
26
-)
27 6
 
28 7
 ### 'query': a named GRangesList object containing gapped reads.
29 8
 ### 'subject': a GRangesList object containing some subfeature (e.g. exons
... ...
@@ -58,19 +58,17 @@ setMethod("txpaths", "SplicingGraphs",
58 58
             stop("'as.matrix' must be TRUE or FALSE")
59 59
         if (length(x) == 0L)
60 60
             stop("'x' must be of length >= 1")
61
-        x_names <- names(x)
62
-        ans <- mcols(x@tx)[ , "txpaths"]
63
-        if (!is.null(x_names)) {
61
+        unlisted_x <- unlist(x)
62
+        unlisted_names <- names(unlisted_x)
63
+        ans <- mcols(unlisted_x)[ , "txpaths"]
64
+        if (!is.null(unlisted_names)) {
64 65
             if (is.na(gene_id))
65 66
                 stop("'gene_id' must be supplied when 'x' has names")
66
-            ans <- ans[x_names == gene_id]
67
+            ans <- ans[unlisted_names == gene_id]
67 68
             if (length(ans) == 0L)
68 69
                 stop("invalid 'gene_id'")
69 70
         } else if (!is.na(gene_id)) {
70
-            stop("the 'gene_id' arg is not supported ",
71
-                 "when 'x' is unnamed (in which case all its elements ",
72
-                 "(i.e. transcripts) are considered to belong to the ",
73
-                 "same gene)")
71
+            stop("the 'gene_id' arg is not supported when 'x' is unnamed")
74 72
         }
75 73
         if (as.matrix)
76 74
             ans <- make_matrix_from_txpaths(ans)
... ...
@@ -95,21 +93,19 @@ setMethod("UATXHcount", "SplicingGraphs",
95 93
             stop("'gene_id' must be a single string (or NA)")
96 94
         if (length(x) == 0L)
97 95
             stop("'x' must be of length >= 1")
98
-        x_names <- names(x)
99
-        ans <- mcols(x@tx)[["UATXHcount"]]
100
-        if (!is.null(x_names)) {
96
+        unlisted_x <- unlist(x)
97
+        unlisted_names <- names(unlisted_x)
98
+        ans <- mcols(unlisted_x)[["UATXHcount"]]
99
+        if (!is.null(unlisted_names)) {
101 100
             if (is.na(gene_id))
102 101
                 stop("'gene_id' must be supplied when 'x' has names")
103 102
             if (is.null(ans))
104 103
                 return(ans)
105
-            ans <- ans[x_names == gene_id]
104
+            ans <- ans[unlisted_names == gene_id]
106 105
             if (length(ans) == 0L)
107 106
                 stop("invalid 'gene_id'")
108 107
         } else if (!is.na(gene_id)) {
109
-            stop("the 'gene_id' arg is not supported ",
110
-                 "when 'x' is unnamed (in which case all its elements ",
111
-                 "(i.e. transcripts) are considered to belong to the ",
112
-                 "same gene)")
108
+            stop("the 'gene_id' arg is not supported when 'x' is unnamed")
113 109
         }
114 110
         ans
115 111
     }
... ...
@@ -277,17 +273,19 @@ setMethod("sgedges", "ANY",
277 273
         if (!is(x, "SplicingGraphs"))
278 274
             stop("'x' must be a SplicingGraphs object ",
279 275
                  "when 'in_by_tx' is a GRangesList object")
280
-        if (length(in_by_tx) != length(x))
281
-            stop("'in_by_tx' must have the same length as 'x'")
282
-        if (!identical(elementLengths(in_by_tx) + 1L, elementLengths(x)))
276
+        unlisted_x <- unlist(x)
277
+        if (length(in_by_tx) != length(unlisted_x))
278
+            stop("'in_by_tx' must have the same length as 'unlist(x)'")
279
+        if (!identical(elementLengths(in_by_tx) + 1L,
280
+                       elementLengths(unlisted_x)))
283 281
             stop("the shape of 'in_by_tx' is not compatible ",
284
-                 "with the shape of 'x'")
282
+                 "with the shape of 'unlist(x)'")
285 283
         if (!identical(keep.dup.edges, FALSE))
286 284
             stop("'keep.dup.edges' must be FALSE when 'in_by_tx' is supplied")
287 285
         sgedges0 <- sgedges(txpaths, UATXHcount=UATXHcount,
288 286
                                      keep.dup.edges=TRUE)
289 287
         ex_or_in <- sgedges0[ , "ex_or_in"]
290
-        ex_hits <- .hits(x@tx, gene_id=gene_id)
288
+        ex_hits <- .hits(unlisted_x, gene_id=gene_id)
291 289
         if (is.null(ex_hits))
292 290
             stop("'x' must have a \"hits\" inner metadata column ",
293 291
                  "when 'in_by_tx' is a GRangesList object. May be ",
... ...
@@ -248,9 +248,11 @@ slideshow <- function(x)
248 248
 {
249 249
     if (!is(x, "SplicingGraphs"))
250 250
         stop("'x' must be a SplicingGraphs object")
251
-    for (gene_id in unique(names(x))) {
252
-        ntx <- sum(names(x) == gene_id)
253
-        cat("Plotting gene ", gene_id, " (", ntx, " transcripts). ", sep="")
251
+    x_eltlen <- elementLengths(x)
252
+    for (gene_id in names(x)) {
253
+        ntx <- x_eltlen[[gene_id]]
254
+        cat("Plotting splicing graph for gene \"", gene_id, "\" ",
255
+            "(", ntx, " transcript(s)). ", sep="")
254 256
         plot(x, gene_id)
255 257
         cat("Press <Enter> for next...")
256 258
         readLines(n=1)
... ...
@@ -29,13 +29,15 @@ loadModels <- function(models_path, check.transcripts=TRUE)
29 29
 ### I guess not...
30 30
 makeSgedgesWithHits <- function(grl, sg)
31 31
 {
32
-    ov0 <- findOverlaps(grl, sg@tx, ignore.strand=TRUE)
33
-    ovenc0 <- encodeOverlaps(grl, sg@tx, hits=ov0,
32
+    unlisted_sg <- unlist(sg)
33
+    ov0 <- findOverlaps(grl, unlisted_sg, ignore.strand=TRUE)
34
+    ovenc0 <- encodeOverlaps(grl, unlisted_sg, hits=ov0,
34 35
                              flip.query.if.wrong.strand=TRUE)
35 36
     ov0_is_comp <- isCompatibleWithSplicing(ovenc0)
36 37
     ov1 <- ov0[ov0_is_comp]
37
-    sg@tx <- assignSubfeatureHits(grl, sg@tx, ov1, ignore.strand=TRUE)
38
-    in_by_tx <- psetdiff(range(sg@tx), sg@tx)
38
+    sg@unlistData <- unname(assignSubfeatureHits(grl, unlisted_sg, ov1,
39
+                                                 ignore.strand=TRUE))
40
+    in_by_tx <- psetdiff(range(unlisted_sg), unlisted_sg)
39 41
     in_by_tx <- assignSubfeatureHits(grl, in_by_tx, ov1, ignore.strand=TRUE)
40 42
     sgedges(sg, in_by_tx=in_by_tx)
41 43
 }
... ...
@@ -51,7 +51,7 @@ library(Gviz)
51 51
 # Plotting to the PDF device produces an incomplete plot (ax_track is missing)!
52 52
 # Looks like a bug in Gviz.
53 53
 ax_track <- GenomeAxisTrack()
54
-grl <- sg@tx[names(sg@tx) == "117286"]
54
+grl <- sg[["117286"]]
55 55
 ### We create 1 track per transcript.
56 56
 tx_tracks <- lapply(seq_along(grl),
57 57
                     function(i) {
... ...
@@ -85,7 +85,7 @@ library(Gviz)
85 85
 # Plotting to the PDF device produces an incomplete plot (ax_track is missing)!
86 86
 # Looks like a bug in Gviz.
87 87
 ax_track <- GenomeAxisTrack()
88
-grl <- sg@tx[names(sg@tx) == "126017"]
88
+grl <- sg[["126017"]]
89 89
 ### We create 1 track per transcript.
90 90
 tx_tracks <- lapply(seq_along(grl),
91 91
                     function(i) {
... ...
@@ -7,9 +7,6 @@
7 7
 \alias{SplicingGraphs,GRangesList-method}
8 8
 \alias{SplicingGraphs,TranscriptDb-method}
9 9
 
10
-\alias{length,SplicingGraphs-method}
11
-\alias{names,SplicingGraphs-method}
12
-\alias{elementLengths,SplicingGraphs-method}
13 10
 \alias{show,SplicingGraphs-method}
14 11
 
15 12
 
... ...
@@ -24,23 +21,13 @@
24 21
 
25 22
 \usage{
26 23
 SplicingGraphs(x, grouping=NULL, min.ntx=2, max.ntx=NA, check.introns=TRUE)
27
-
28
-## Basic accessors:
29
-
30
-\S4method{length}{SplicingGraphs}(x)
31
-\S4method{names}{SplicingGraphs}(x)
32
-\S4method{elementLengths}{SplicingGraphs}(x)
33 24
 }
34 25
 
35 26
 \arguments{
36 27
   \item{x}{
37
-    For \code{SplicingGraphs}: A \link[GenomicRanges]{GRangesList} object
38
-    containing the exons of one or more genes grouped by transcripts.
39
-    Alternatively, \code{x} can be a \link[GenomicFeatures]{TranscriptDb}
40
-    object. See Details section below.
41
-
42
-    For the \code{length}, \code{names}, and \code{elementLengths} methods
43
-    for SplicingGraphs objects: A SplicingGraphs object.
28
+    A \link[GenomicRanges]{GRangesList} object containing the exons of one
29
+    or more genes grouped by transcripts. Alternatively, \code{x} can be a
30
+    \link[GenomicFeatures]{TranscriptDb} object. See Details section below.
44 31
   }
45 32
   \item{grouping}{
46 33
     An optional object that represents the grouping by gene of the top-level
... ...
@@ -183,12 +170,17 @@ sg2 <- SplicingGraphs(toy_genes_txdb)
183 170
 stopifnot(identical(sg2, sg))
184 171
 sg
185 172
 
186
-## 'sg' has 1 element per transcript, and each transcript is
187
-## assigned a name that is the id of the gene it belongs to. All the
188
-## transcripts belonging to the same gene are guaranteed to be
189
-## consecutive elements in 'sg'.
173
+## 'sg' has 1 element per gene. 'names(sg)' are the gene ids.
190 174
 names(sg)
191 175
 
176
+## The transcripts of a given gene can be extracted as an *unnamed*
177
+## GRangesList object with [[:
178
+sg[["geneB"]]
179
+
180
+## The transcripts of all the genes can be extracted as a *named*
181
+## GRangesList object with unlist():
182
+unlist(sg)
183
+
192 184
 ## ---------------------------------------------------------------------
193 185
 ## 3. Extract information from the SplicingGraphs object
194 186
 ## ---------------------------------------------------------------------
... ...
@@ -57,10 +57,7 @@ bubbles(x, gene_id=NA)
57 57
 example(SplicingGraphs)  # create SplicingGraphs object 'sg'
58 58
 sg
59 59
 
60
-## 'sg' has 1 element per transcript, and each transcript is
61
-## assigned a name that is the id of the gene it belongs to. All the
62
-## transcripts belonging to the same gene are guaranteed to be
63
-## consecutive elements in 'sg'.
60
+## 'sg' has 1 element per gene. 'names(sg)' are the gene ids.
64 61
 names(sg)
65 62
 
66 63
 plot(sgraph(sg, gene_id="geneA", tx_id.as.edge.label=TRUE))
... ...
@@ -69,7 +69,7 @@ gal
69 69
 
70 70
 ## Find the overlaps between the reads and the transcripts:
71 71
 grl <- grglist(gal, order.as.in.query=TRUE)
72
-ov0 <- findOverlaps(grl, sg, ignore.strand=TRUE)
72
+ov0 <- findOverlaps(grl, unlist(sg), ignore.strand=TRUE)
73 73
 
74 74
 ## Nb of hits:
75 75
 head(countQueryHits(ov0))  # per read
... ...
@@ -77,7 +77,7 @@ head(countSubjectHits(ov0))  # per transcript
77 77
 
78 78
 ## Keep only overlaps that are "compatible" with the splicing of the
79 79
 ## transcripts:
80
-ovenc0 <- encodeOverlaps(grl, sg, hits=ov0,
80
+ovenc0 <- encodeOverlaps(grl, unlist(sg), hits=ov0,
81 81
                          flip.query.if.wrong.strand=TRUE)
82 82
 ov0_is_comp <- isCompatibleWithSplicing(ovenc0)
83 83
 ov1 <- ov0[ov0_is_comp]
... ...
@@ -88,7 +88,7 @@ ov1 <- ov0[ov0_is_comp]
88 88
 
89 89
 ov2 <- ov1[queryHits(ov1) \%in\% which(countQueryHits(ov1) == 1L)]
90 90
 UATXHcount <- countSubjectHits(ov2)
91
-mcols(sg@tx)$UATXHcount <- UATXHcount
91
+mcols(sg@unlistData)$UATXHcount <- UATXHcount
92 92
 
93 93
 names(sg)  # valid gene ids
94 94
 plot(sg, "geneA")
... ...
@@ -100,10 +100,12 @@ plot(sg, "geneA")
100 100
 ## ---------------------------------------------------------------------
101 101
 
102 102
 ## Assign compatible hits for each exon:
103
-sg@tx <- assignSubfeatureHits(grl, sg@tx, ov1, ignore.strand=TRUE)
103
+unlisted_sg <- unlist(sg)
104
+sg@unlistData <- unname(assignSubfeatureHits(grl, unlisted_sg, ov1,
105
+                                             ignore.strand=TRUE))
104 106
 
105 107
 ## Assign compatible hits for each intron:
106
-in_by_tx <- psetdiff(range(sg@tx), sg@tx)
108
+in_by_tx <- psetdiff(range(unlisted_sg), unlisted_sg)
107 109
 in_by_tx <- assignSubfeatureHits(grl, in_by_tx, ov1, ignore.strand=TRUE)
108 110
 
109 111
 sgedges(sg, gene_id="geneA", in_by_tx=in_by_tx)
... ...
@@ -107,10 +107,7 @@ uninformativeSSids(x, gene_id=NA)
107 107
 example(SplicingGraphs)  # create SplicingGraphs object 'sg'
108 108
 sg
109 109
 
110
-## 'sg' has 1 element per transcript, and each transcript is
111
-## assigned a name that is the id of the gene it belongs to. All the
112
-## transcripts belonging to the same gene are guaranteed to be
113
-## consecutive elements in 'sg'.
110
+## 'sg' has 1 element per gene. 'names(sg)' are the gene ids.
114 111
 names(sg)
115 112
 
116 113
 sgedges(sg, gene_id="geneD")
... ...
@@ -119,5 +116,5 @@ outdeg(sg, gene_id="geneD")
119 116
 indeg(sg, gene_id="geneD")
120 117
 
121 118
 txpaths(sg, gene_id="geneD")
122
-txpaths(sg, gene_id="geneD", as.matrix=TRUE)
119
+txpaths(sg, gene_id="geneD", as.matrix=TRUE)  # splicing matrix
123 120
 }
... ...
@@ -83,10 +83,7 @@ slideshow(x)
83 83
 example(SplicingGraphs)  # create SplicingGraphs object 'sg'
84 84
 sg
85 85
 
86
-## 'sg' has 1 element per transcript, and each transcript is
87
-## assigned a name that is the id of the gene it belongs to. All the
88
-## transcripts belonging to the same gene are guaranteed to be
89
-## consecutive elements in 'sg'.
86
+## 'sg' has 1 element per gene. 'names(sg)' are the gene ids.
90 87
 names(sg)
91 88
 
92 89
 sgA <- sgraph(sg, gene_id="geneA", tx_id.as.edge.label=TRUE)