Browse code

- Change the order of the edge-level metadata cols in the GRangesList object returned by sgedgesByTranscript() and sgedgesByGene(): the "sgedge_id" col (global splicing graph edge id) now is in 3rd position (instead of 1st), after the "from" and "to" cols.

- By default, the output of sgedges() now also includes the "sgedge_id" col,
so it has the same cols as the edge-level metadata cols returned by default
by sgedgesByTranscript() and sgedgesByGene().

- Change the order of the edge-level metadata cols in the GRangesList object
returned by rsgedgesByGene(): the "rsgedge_id" col (global reduced splicing
graph edge id) now is also in 3rd position (instead of 1st), after the
"from" and "to" cols.

- By default, the output of rsgedges() now also includes the "rsgedge_id" col,
so it has the same cols as the edge-level metadata cols returned by default
by rsgedgesByGene().

- The "tx_id" col in the output of all the above functions now is a
CharacterList instead of a broken IntegerList that was holding a factor in
its unlistData slot.


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SplicingGraphs@75803 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 19/04/2013 10:13:26
Showing 8 changed files

... ...
@@ -6,8 +6,8 @@
6 6
 ### *reduced* splicing graphs.
7 7
 ###
8 8
 
9
-### Return the uninformative fully qualified nodes.
10
-.uninformative_fqnodes <- function(sg)
9
+### Return the fully qualified uninformative nodes.
10
+.get_fq_uninfnodes <- function(sg)
11 11
 {
12 12
     txpath <- txpath(unlist(sg))
13 13
     skeleton2 <- PartitioningByEnd(end(PartitioningByEnd(txpath)) * 2L)
... ...
@@ -25,65 +25,84 @@
25 25
     gene_id <- gene_id[c(TRUE, FALSE)]
26 26
     from <- tmp[c(TRUE, FALSE)]
27 27
     to <- tmp[c(FALSE, TRUE)]
28
-    sgedge_id <- paste0(gene_id, ":", from, ",", to)
28
+    sgedge_id <- make_global_sgedge_id(gene_id, from, to)
29 29
     keep_idx <- which(!duplicated(sgedge_id))
30
-    fqfrom <- paste0(gene_id, ":", from)[keep_idx]  # fully qualified ids
31
-    fqto <- paste0(gene_id, ":", to)[keep_idx]  # fully qualified ids
32
-    uninformative_sgnodes(fqfrom, fqto)
30
+    fq_from <- paste(gene_id, from, sep=":")[keep_idx]  # fully qualified ids
31
+    fq_to <- paste(gene_id, to, sep=":")[keep_idx]  # fully qualified ids
32
+    uninformative_sgnodes(fq_from, fq_to)
33 33
 }
34 34
 
35
-.pmerge <- function(x, y)
35
+### 'f': factor. The "reverse factor" of 'f' is the named list of integer
36
+### vectors that maps each level of 'f' to the positions in 'f' where that
37
+### level is used. It can quickly be computed with:
38
+###
39
+###     revfactor <- split(seq_along(f), f)
40
+###
41
+### 'f' can be rebuilt from 'revfactor' with:
42
+###
43
+###     f2 <- .make_factor_from_revfactor(revfactor, length(f))
44
+###     stopifnot(identical(f2, f)
45
+###
46
+.make_factor_from_revfactor <- function(revfactor, f_len)
36 47
 {
37
-    x_partitioning <- PartitioningByEnd(x)
38
-    y_partitioning <- PartitioningByEnd(y)
39
-    stopifnot(identical(x_partitioning, y_partitioning))
40
-    starts <- start(x_partitioning)
41
-    ends <- end(x_partitioning)
42
-    unlisted_x <- unlist(x, use.names=FALSE)
43
-    unlisted_y <- unlist(y, use.names=FALSE)
44
-    stopifnot(identical(unlisted_x[-starts], unlisted_y[-ends]))
45
-    y <- as(unlisted_y[ends], "List")
46
-    xy <- c(x, y)
47
-    f <- rep.int(seq_along(x), 2L)
48
-    ans <- unlistAndSplit(xy, f)
49
-    names(ans) <- names(x_partitioning)
50
-    ans
48
+    f_levels <- names(revfactor)
49
+    idx <- integer(f_len)
50
+    idx[] <- NA_integer_
51
+    idx[unlist(revfactor, use.names=FALSE)] <- rep.int(seq_along(revfactor),
52
+                                                   elementLengths(revfactor))
53
+    factor(f_levels[idx], levels=f_levels)
54
+}
55
+
56
+### 'from' and 'to' must have the same length N (nb of unique edges in the
57
+### SplicingGraphs object before reduction).
58
+.make_revfactor_from_uninfnodes <- function(uninfnodes, from, to)
59
+{
60
+    from_idx <- match(uninfnodes, from)
61
+    to_idx <- match(uninfnodes, to)
62
+    keep_idx <- which(!(is.na(from_idx) | is.na(to_idx)))
63
+    from_idx <- from_idx[keep_idx]
64
+    to_idx <- to_idx[keep_idx]
65
+    stopifnot(all(from_idx == to_idx + 1L))  # sanity check
66
+    if (length(from_idx) == 0L) {
67
+        group1 <- integer(0)
68
+    } else {
69
+        group1 <- cumsum(c(TRUE, diff(from_idx) != 1L))
70
+    }
71
+    split_to_idx <- unname(splitAsList(to_idx, group1))
72
+    split_from_idx <- unname(splitAsList(from_idx, group1))
73
+    fancy_punion(split_to_idx, split_from_idx)
51 74
 }
52 75
 
53
-### 'gene_id', 'from', and 'to', must have the same length N (nb of unique
54
-### edges in the SplicingGraphs object befor reduction).
76
+### 'revfactor' must be a "reverse factor" as returned by
77
+### .make_revfactor_from_uninfnodes().
55 78
 ### Returns a character vector of length N containing the global rsgedge id
56 79
 ### (global reduced splicing graph edge id) corresponding to each input edge.
57
-.build_sgedge2rsgedge_map <- function(gene_id, from, to, ui_fqnodes)
80
+.build_sgedge2rsgedge_map_from_revfactor <- function(revfactor,
81
+                                                     gene_id, from, to)
58 82
 {
59
-    ans <- paste0(gene_id, ":", from, ",", to)
60
-
61
-    fqfrom <- paste0(gene_id, ":", from)  # fully qualified ids
62
-    fqto <- paste0(gene_id, ":", to)  # fully qualified ids
63
-    idx1 <- match(ui_fqnodes, fqfrom)
64
-    idx2 <- match(ui_fqnodes, fqto)
65
-    keep_idx <- which(!(is.na(idx1) | is.na(idx2)))
66
-    idx1 <- idx1[keep_idx]
67
-    idx2 <- idx2[keep_idx]
68
-    stopifnot(all(idx1 == idx2 + 1L))  # sanity check
69
-    group1 <- cumsum(c(TRUE, diff(idx1) != 1L))
70
-    split_idx2 <- unname(splitAsList(idx2, group1))
71
-    split_idx1 <- unname(splitAsList(idx1, group1))
72
-    idx <- .pmerge(split_idx2, split_idx1)
73
-    alter_idx <- idx@unlistData
74
-
75
-    from_list <- relist(from[alter_idx], idx)
76
-    to_list <- relist(to[alter_idx], idx)
77
-    nodes_list <- .pmerge(from_list, to_list)
83
+    ans <- make_global_sgedge_id(gene_id, from, to)
84
+
85
+    unlisted_revfactor <- unlist(revfactor, use.names=FALSE)
86
+    from_list <- relist(from[unlisted_revfactor], revfactor)
87
+    to_list <- relist(to[unlisted_revfactor], revfactor)
88
+    nodes_list <- fancy_punion(from_list, to_list)
78 89
 
79 90
     rsgedge_id <- sapply(nodes_list, paste0, collapse=",")
80
-    rsgedge_id <- rep.int(rsgedge_id, elementLengths(idx))
81
-    rsgedge_id <- paste(gene_id[alter_idx], rsgedge_id, sep=":")
91
+    rsgedge_id <- rep.int(rsgedge_id, elementLengths(revfactor))
92
+    rsgedge_id <- paste(gene_id[unlisted_revfactor], rsgedge_id, sep=":")
82 93
 
83
-    ans[alter_idx] <- rsgedge_id
94
+    ans[unlisted_revfactor] <- rsgedge_id
84 95
     ans
85 96
 }
86 97
 
98
+.build_sgedge2rsgedge_map <- function(uninfnodes, gene_id, from, to)
99
+{
100
+    fq_from <- paste(gene_id, from, sep=":")  # fully qualified ids
101
+    fq_to <- paste(gene_id, to, sep=":")  # fully qualified ids
102
+    revfactor <- .make_revfactor_from_uninfnodes(uninfnodes, fq_from, fq_to)
103
+    .build_sgedge2rsgedge_map_from_revfactor(revfactor, gene_id, from, to)
104
+}
105
+
87 106
 ### 'edges' must be a GRanges object.
88 107
 .reduce_edges <- function(edges, f)
89 108
 {
... ...
@@ -136,9 +155,9 @@
136 155
     stopifnot(identical(edges_tx_id, edges_tx_id[sm]))  # sanity check
137 156
     ans_tx_id <- edges_tx_id[keep_idx]
138 157
 
139
-    ans_mcols <- DataFrame(rsgedge_id=levels(f),
140
-                           from=ans_from,
158
+    ans_mcols <- DataFrame(from=ans_from,
141 159
                            to=ans_to,
160
+                           rsgedge_id=levels(f),
142 161
                            ex_or_in=ans_ex_or_in,
143 162
                            tx_id=ans_tx_id)
144 163
 
... ...
@@ -244,9 +263,9 @@ setMethod("rsgedgesByGene", "SplicingGraphs",
244 263
         gene_id <- names(edges0)
245 264
         from <- edges0_mcols[ , "from"]
246 265
         to <- edges0_mcols[ , "to"]
247
-        ui_fqnodes <- .uninformative_fqnodes(x)
248
-        sgedge2rsgedge_map <- .build_sgedge2rsgedge_map(gene_id, from, to,
249
-                                                        ui_fqnodes)
266
+        uninfnodes <- .get_fq_uninfnodes(x)
267
+        sgedge2rsgedge_map <- .build_sgedge2rsgedge_map(uninfnodes,
268
+                                                        gene_id, from, to)
250 269
         f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map))
251 270
         ans_flesh <- .reduce_edges(edges0, f)
252 271
         ans_flesh_names <- Rle(names(ans_flesh))
... ...
@@ -261,7 +280,7 @@ setMethod("rsgedgesByGene", "SplicingGraphs",
261 280
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
262 281
 ### rsgedges() extractor
263 282
 ###
264
-### Same as sgedges() except that uninformative nodes (i.e. SSids) are removed.
283
+### Same as sgedges() except that uninformative nodes are removed.
265 284
 ###
266 285
 
267 286
 ### 'sgedges' must be a DataFrame as returned by:
... ...
@@ -278,15 +297,47 @@ setMethod("rsgedgesByGene", "SplicingGraphs",
278 297
              "to rsgedges()?")
279 298
     levels(ex_or_in) <- EX_OR_IN_LEVELS2
280 299
     uninformative_SSids <- uninformativeSSids(sgedges)
281
-    if (length(uninformative_SSids) == 0L)
300
+    if (length(uninformative_SSids) == 0L) {
301
+        col_idx <- match("sgedge_id", colnames(sgedges))
302
+        colnames(sgedges)[col_idx] <- "rsgedge_id"
303
+        sgedges$ex_or_in <- ex_or_in
282 304
         return(sgedges)
305
+    }
306
+
283 307
     from <- sgedges[ , "from"]
284 308
     to <- sgedges[ , "to"]
309
+    from_idx <- match(uninformative_SSids, from)
310
+    to_idx <- match(uninformative_SSids, to)
311
+    if (!all(from_idx == to_idx + 1L))
312
+        stop("Malformed input.\n",
313
+             "  In the input data.frame (or DataFrame) representing the ",
314
+             "original splicing graph, each uninformative splicing site ",
315
+             "id must appear in 2 consecutive rows (first in the \"to\" ",
316
+             "column, then in the \"from\" column. Could it be that the ",
317
+             "rows were subsetted before the data.frame (or DataFrame) ",
318
+             "was passed to rsgedges()?")
319
+
320
+    ## Reduce "from" and "to" cols.
321
+    ans_from <- from[-from_idx]
322
+    ans_to <- to[-to_idx]
323
+
324
+    ## Reduce "sgedge_id" col.
325
+    sgedges_id <- sgedges[ , "sgedge_id"]
326
+    tmp <- unlist(strsplit(sgedges_id, ":", fixed=TRUE), use.names=FALSE)
327
+    gene_id <- tmp[c(TRUE, FALSE)]
328
+    revfactor <- .make_revfactor_from_uninfnodes(uninformative_SSids, from, to)
329
+    sgedge2rsgedge_map <- .build_sgedge2rsgedge_map_from_revfactor(revfactor,
330
+                                                             gene_id, from, to)
331
+    f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map))
332
+    ans_rsgedge_id <- levels(f)
333
+
334
+    ## Reduce "ex_or_in" col.
335
+    ex_or_in[from_idx] <- EX_OR_IN_LEVELS2[4L]
336
+    ans_ex_or_in <- ex_or_in[-to_idx]
337
+
338
+    ## Reduce "tx_id" col.
285 339
     tx_id <- sgedges[ , "tx_id"]
286
-    idx1 <- match(uninformative_SSids, from)
287
-    idx2 <- match(uninformative_SSids, to)
288
-    ## 2 sanity checks.
289
-    if (!identical(unname(tx_id[idx1]), unname(tx_id[idx2])))
340
+    if (!identical(unname(tx_id[from_idx]), unname(tx_id[to_idx])))
290 341
         stop("Malformed input.\n",
291 342
              "  In the input data.frame (or DataFrame) representing the ",
292 343
              "original splicing graph, the 2 rows containing a given ",
... ...
@@ -294,20 +345,13 @@ setMethod("rsgedgesByGene", "SplicingGraphs",
294 345
              "Could it be that the \"tx_id\" column was manually altered ",
295 346
              "before the data.frame (or DataFrame) was passed to ",
296 347
              "rsgedges()?")
297
-    if (!all(idx1 == idx2 + 1L))
298
-        stop("Malformed input.\n",
299
-             "  In the input data.frame (or DataFrame) representing the ",
300
-             "original splicing graph, each uninformative splicing site ",
301
-             "id must appear in 2 consecutive rows (first in the \"to\" ",
302
-             "column, then in the \"from\" column. Could it be that the ",
303
-             "rows were subsetted before the data.frame (or DataFrame) ",
304
-             "was passed to rsgedges()?")
305
-    from <- from[-idx1]
306
-    to <- to[-idx2]
307
-    ex_or_in[idx1] <- EX_OR_IN_LEVELS2[4L]
308
-    ex_or_in <- ex_or_in[-idx2]
309
-    tx_id <- tx_id[-idx1]
310
-    DataFrame(from=from, to=to, ex_or_in=ex_or_in, tx_id=tx_id)
348
+    ans_tx_id <- tx_id[-from_idx]
349
+
350
+    DataFrame(from=ans_from,
351
+              to=ans_to,
352
+              rsgedge_id=ans_rsgedge_id,
353
+              ex_or_in=ans_ex_or_in,
354
+              tx_id=ans_tx_id)
311 355
 }
312 356
 
313 357
 rsgedges <- function(x)
... ...
@@ -22,7 +22,7 @@
22 22
 ### transcript) for a given gene. Should have been obtained thru the txpath()
23 23
 ### accessor. Returns a 4-col (or 5-col if 'txweight' is supplied) data.frame
24 24
 ### representing the splicing graph.
25
-.make_sgedges0_from_txpath <- function(txpath, txweight=NULL)
25
+.make_sgedges0_from_txpath <- function(txpath, gene_id, txweight=NULL)
26 26
 {
27 27
     if (!is.null(txweight)) {
28 28
         if (!is.numeric(txweight))
... ...
@@ -40,6 +40,8 @@
40 40
                                      "an odd number of splicing site ids")
41 41
                             from <- c("R", txpath_i)
42 42
                             to <- c(txpath_i, "L")
43
+                            sgedge_id <- make_global_sgedge_id(gene_id,
44
+                                                               from, to)
43 45
                             nexons <- txpath_i_len %/% 2L
44 46
                             if (nexons == 0L) {
45 47
                                 ex_or_in <- EX_OR_IN_LEVELS[3L]
... ...
@@ -55,6 +57,7 @@
55 57
                                                levels=EX_OR_IN_LEVELS)
56 58
                             data.frame(from=from,
57 59
                                        to=to,
60
+                                       sgedge_id=sgedge_id,
58 61
                                        ex_or_in=ex_or_in,
59 62
                                        stringsAsFactors=FALSE)
60 63
                         })
... ...
@@ -63,7 +66,7 @@
63 66
     tx_id <- names(txpath)
64 67
     if (is.null(tx_id))
65 68
         tx_id <- seq_along(txpath)
66
-    tx_id <- rep.int(factor(tx_id, levels=tx_id), nedges_per_tx)
69
+    tx_id <- rep.int(tx_id, nedges_per_tx)
67 70
     sgedges0$tx_id <- tx_id
68 71
     if (!is.null(txweight))
69 72
         sgedges0$txweight <- rep.int(txweight, nedges_per_tx)
... ...
@@ -72,17 +75,15 @@
72 75
 
73 76
 ### Collapse the duplicated edges in 'sgedges0' into a DataFrame.
74 77
 ### We use a DataFrame instead of a data.frame because we want to store
75
-### the tx_id col in a CompressedFactorList (even though this container
76
-### doesn't formally exist and a CompressedIntegerList is actually what's
77
-### being used).
78
+### the "tx_id" col in a CharacterList.
78 79
 .make_sgedges_from_sgedges0 <- function(sgedges0, ex_hits=NULL, in_hits=NULL)
79 80
 {
80 81
     from <- sgedges0[ , "from"]
81 82
     to <- sgedges0[ , "to"]
83
+    sgedge_id <- sgedges0[ , "sgedge_id"]
82 84
     ex_or_in <- sgedges0[ , "ex_or_in"]
83 85
     tx_id <- sgedges0[ , "tx_id"]
84
-    edges <- paste(from, to, sep="~")
85
-    sm <- match(edges, edges)
86
+    sm <- match(sgedge_id, sgedge_id)
86 87
     if (!all(ex_or_in == ex_or_in[sm]))
87 88
         stop("invalid splicing graph")
88 89
     is_not_dup <- sm == seq_along(sm)
... ...
@@ -178,14 +179,14 @@ setMethod("sgedges", "SplicingGraphs",
178 179
     {
179 180
         if (!isTRUEorFALSE(keep.dup.edges))
180 181
             stop("'keep.dup.edges' must be TRUE or FALSE")
181
-        txpath <- txpath(x)
182
+        txpath <- txpath(x)  # fails if length(x) != 1
183
+        gene_id <- names(x)
182 184
         if (is.null(txweight))
183 185
             txweight <- txweight(x)
186
+        sgedges0 <- .make_sgedges0_from_txpath(txpath, gene_id,
187
+                                               txweight=txweight)
184 188
         if (keep.dup.edges)
185
-            return(sgedges(txpath, txweight=txweight,
186
-                                   keep.dup.edges=keep.dup.edges))
187
-        sgedges0 <- sgedges(txpath, txweight=txweight,
188
-                                    keep.dup.edges=TRUE)
189
+            return(sgedges0)
189 190
         exon_hits <- .extract_sgedges_exon_hits(x)
190 191
         intron_hits <- .extract_sgedges_intron_hits(x)
191 192
         ## FIXME: Once .extract_sgedges_intron_hits() is fixed, merge the
... ...
@@ -198,28 +199,6 @@ setMethod("sgedges", "SplicingGraphs",
198 199
     }
199 200
 )
200 201
 
201
-setMethod("sgedges", "IntegerList",
202
-    function(x, txweight=NULL, keep.dup.edges=FALSE)
203
-    {
204
-        sgedges0 <- .make_sgedges0_from_txpath(x, txweight=txweight)
205
-        sgedges(sgedges0, keep.dup.edges=keep.dup.edges)
206
-    }
207
-)
208
-
209
-setMethod("sgedges", "data.frame",
210
-    function(x, txweight=NULL, keep.dup.edges=FALSE)
211
-    {
212
-        if (!is.null(txweight))
213
-            stop("the 'txweight' arg is not supported ",
214
-                 "when 'x' is a data.frame")
215
-        if (!isTRUEorFALSE(keep.dup.edges))
216
-            stop("'keep.dup.edges' must be TRUE or FALSE")
217
-        if (keep.dup.edges)
218
-            return(x)  # no-op
219
-        .make_sgedges_from_sgedges0(x)
220
-    }
221
-)
222
-
223 202
 
224 203
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
225 204
 ### sgnodes() accessor
... ...
@@ -229,7 +208,7 @@ setGeneric("sgnodes", signature="x",
229 208
     function(x) standardGeneric("sgnodes")
230 209
 )
231 210
 
232
-setMethod("sgnodes", "ANY",
211
+setMethod("sgnodes", "SplicingGraphs",
233 212
     function(x)
234 213
     {
235 214
         txpath <- txpath(x)
... ...
@@ -270,7 +249,8 @@ setMethod("outdeg", "DataFrame",
270 249
     function(x)
271 250
     {
272 251
         sgnodes <- sgnodes(x)
273
-        ans <- countMatches(sgnodes, x[ , "from"])
252
+        m <- match(x[ , "from"], sgnodes)
253
+        ans <- tabulate(m, nbins=length(sgnodes))
274 254
         names(ans) <- sgnodes
275 255
         ans
276 256
     }
... ...
@@ -292,7 +272,8 @@ setMethod("indeg", "DataFrame",
292 272
     function(x)
293 273
     {
294 274
         sgnodes <- sgnodes(x)
295
-        ans <- countMatches(sgnodes, x[ , "to"])
275
+        m <- match(x[ , "to"], sgnodes)
276
+        ans <- tabulate(m, nbins=length(sgnodes))
296 277
         names(ans) <- sgnodes
297 278
         ans
298 279
     }
... ...
@@ -82,7 +82,6 @@ setMethod("sgedgesByTranscript", "SplicingGraphs",
82 82
         in_prepend_mcols$ex_or_in <- ex_or_in
83 83
 
84 84
         tx_id <- mcols(ex_by_tx)[ , "tx_id"]
85
-        tx_id <- factor(tx_id, levels=unique(tx_id))
86 85
         ex_prepend_mcols$tx_id <- rep.int(tx_id, nex_by_tx)
87 86
         in_prepend_mcols$tx_id <- rep.int(tx_id, nin_by_tx)
88 87
 
... ...
@@ -131,11 +130,14 @@ setMethod("sgedgesByTranscript", "SplicingGraphs",
131 130
         stopifnot(identical(ans_unlistData_end[minus_introns_idx] + 1L,
132 131
                             ans_unlistData_start[minus_introns_idx - 1L]))
133 132
 
134
-        ## Add "sgedge_id" metadata col.
135
-        sgedge_id <- paste0(rep.int(gene_ids, width(ans_partitioning)), ":",
136
-                            from, ",", to)
137
-        ans_unlistData_mcols <- cbind(DataFrame(sgedge_id=sgedge_id),
138
-                                      ans_unlistData_mcols)
133
+        ## Insert "sgedge_id" metadata col after first 2 metadata cols
134
+        ## ("from" and "to").
135
+        sgedge_id <- make_global_sgedge_id(
136
+                         rep.int(gene_ids, width(ans_partitioning)),
137
+                         from, to)
138
+        ans_unlistData_mcols <- c(ans_unlistData_mcols[1:2],
139
+                                  DataFrame(sgedge_id=sgedge_id),
140
+                                  ans_unlistData_mcols[-(1:2)])
139 141
         check_all_edge_mcolnames(colnames(ans_unlistData_mcols))
140 142
 
141 143
         ## Drop unwanted columns.
... ...
@@ -98,13 +98,13 @@ commonStrand.GRangesList <- function(x)
98 98
 EXON_MCOLS <- c("exon_id", "exon_name", "exon_rank", "start_SSid", "end_SSid")
99 99
 
100 100
 ### All edge metadata columns.
101
-ALL_EDGE_MCOLS <- c("sgedge_id", "from", "to", "ex_or_in", "tx_id", EXON_MCOLS)
101
+ALL_EDGE_MCOLS <- c("from", "to", "sgedge_id", "ex_or_in", "tx_id", EXON_MCOLS)
102 102
 
103 103
 ### Subset of 'ALL_EDGE_MCOLS' made of those columns that are considered
104 104
 ### invariant i.e. the values in them associated with the same sgedge_id
105 105
 ### (global edge id) should be the same. Note that we also include the
106 106
 ### "sgedge_id" col itself.
107
-INVARIANT_EDGE_MCOLS <- c("sgedge_id", "from", "to", "ex_or_in",
107
+INVARIANT_EDGE_MCOLS <- c("from", "to", "sgedge_id", "ex_or_in",
108 108
                           "start_SSid", "end_SSid")
109 109
 
110 110
 EX_OR_IN_LEVELS2 <- c("ex", "in", "", "mixed")
... ...
@@ -155,6 +155,23 @@ get_index_of_invariant_edge_mcols <- function(colnames)
155 155
 }
156 156
 
157 157
 
158
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
159
+### make_sgedge_id()
160
+###
161
+### Returns "global splicing graph edge id".
162
+###
163
+
164
+make_global_sgedge_id <- function(gene_id, from, to)
165
+{
166
+    ans_len <- length(from)
167
+    stopifnot(length(to) == ans_len)
168
+    stopifnot(length(gene_id) == 1L || length(gene_id) == ans_len)
169
+    if (ans_len == 0L)
170
+        return(character(0))
171
+    paste0(gene_id, ":", from, ",", to)
172
+}
173
+
174
+
158 175
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
159 176
 ### unlistAndSplit()
160 177
 ###
... ...
@@ -184,6 +201,55 @@ unlistAndSplit <- function(x, f, drop=FALSE)
184 201
 }
185 202
 
186 203
 
204
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
205
+### fancy_punion() -- fancy parallel union
206
+###
207
+### Expects 'x' and 'y' to be 2 list-like objects such that:
208
+###   (a) all the list elements in 'x' and 'y' are vector-like objects of the
209
+###       same class;
210
+###   (b) 'x' and 'y' have the same length and names;
211
+###   (c) 'x' and 'y' have the same shape i.e. for any valid 'i', 'x[[i]]'
212
+###       and 'y[[i]]' have the same length;
213
+###   (d) 'x' and 'y' have no zero-length list elements;
214
+###   (e) for any valid index 'i', 'y[[i]][-L_i]' is identical to 'x[[i]][-1]',
215
+###       where L_i is the length of 'y[[i]]' (and 'x[[i]]').
216
+### Performs an optimized 'mendoapply(union, x, y)'.
217
+###
218
+### Example:
219
+###
220
+###   > x <- IntegerList(c(12, 4, 9), 5, c(8, -2))
221
+###   > y <- IntegerList(c(4, 9, 8), 0, c(-2, 10))
222
+###   > fancy_punion(x, y)
223
+###   IntegerList of length 3
224
+###   [[1]] 12 4 9 8
225
+###   [[2]] 5 0
226
+###   [[3]] 8 -2 10
227
+###
228
+fancy_punion <- function(x, y)
229
+{
230
+    x_partitioning <- PartitioningByEnd(x)
231
+    y_partitioning <- PartitioningByEnd(y)
232
+    if (!identical(x_partitioning, y_partitioning))
233
+        stop("'x' and 'y' must have the same length, names, and shape")
234
+    starts <- start(x_partitioning)
235
+    ends <- end(x_partitioning)
236
+    if (any(ends - starts == -1L))
237
+        stop("'x' and 'y' have zero-length list elements")
238
+    x_flesh <- unlist(x, use.names=FALSE)
239
+    y_flesh <- unlist(y, use.names=FALSE)
240
+    if (!identical(x_flesh[-starts], y_flesh[-ends]))
241
+        stop("for any valid index 'i', 'y[[i]][-length(y[[i]])]' ",
242
+             "must be identical to 'x[[i]][-1]'")
243
+    ans_breakpoints <- ends + seq_along(ends)
244
+    ans_flesh <- c(x_flesh, y_flesh[ends])
245
+    ans_flesh[-ans_breakpoints] <- x_flesh
246
+    ans_flesh[ans_breakpoints] <- y_flesh[ends]
247
+    ans_skeleton <- PartitioningByEnd(ans_breakpoints,
248
+                                      names=names(x_partitioning))
249
+    relist(ans_flesh, ans_skeleton)
250
+}
251
+
252
+
187 253
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
188 254
 ### uninformative_sgnodes()
189 255
 ###
... ...
@@ -543,6 +543,8 @@ The \Rclass{DataFrame} object returned by \Rfunction{sgedges} has the
543 543
 following columns:
544 544
 \begin{itemize}
545 545
   \item \Rcode{from}, \Rcode{to}: The 2 nodes connected by the edge.
546
+  \item \Rcode{sgedge\_id}: A \textit{global edge id} of the form
547
+        \Rcode{gene\_id:from,to}.
546 548
   \item \Rcode{ex\_or\_in}: The type of the edge, i.e., exon, intron, or
547 549
         no type if it's an artificial edge.
548 550
   \item \Rcode{tx\_id}: The ids of the transcripts that support the edge.
... ...
@@ -566,17 +568,11 @@ edges_by_gene[["107328"]]
566 568
 @
567 569
 \end{small}
568 570
 
569
-The edge-level metadata columns are the following:
570
-\begin{itemize}
571
-  \item \Rcode{sgedge\_id}: A \textit{global edge id} of the form
572
-        \Rcode{gene\_id:from,to}.
573
-  \item \Rcode{from}, \Rcode{to}, \Rcode{ex\_or\_in}, \Rcode{tx\_id}:
574
-        See the columns of the \Rclass{DataFrame} object returned
575
-        by \Rfunction{sgedges}.
576
-\end{itemize}
577
-
578
-The artificial edges (i.e., edges starting from the root
579
-node (\Rcode{R}) or ending at the leaf node (\Rcode{L})) are omitted!
571
+The edge-level metadata columns are the same as the columns of the
572
+\Rclass{DataFrame} object returned by \Rfunction{sgedges}.
573
+An important difference though is that the artificial edges (i.e., edges
574
+starting from the root node (\Rcode{R}) or ending at the leaf node
575
+(\Rcode{L})) are omitted!
580 576
 
581 577
 Finally, to plot a given splicing graph:
582 578
 
... ...
@@ -99,10 +99,19 @@ countReads(x, by=c("sgedge", "rsgedge"))
99 99
   the reads assigned to it.
100 100
 
101 101
   For \code{countReads}: a \link[IRanges]{DataFrame} object with one row
102
-  per unique splicing graph edge and one column of counts per sample.
103
-  Two additional columns (\code{"sgedge_id"} and \code{"ex_or_in"}) contain
104
-  the splicing graph edge ids and the type of edge (exon or intron),
105
-  respectively.
102
+  per unique splicing graph edge if \code{by="sgedge"}, or one row per
103
+  unique reduced splicing graph edge if \code{by="rsgedge"}.
104
+  There is one column of counts per sample, and the following two
105
+  additional columns:
106
+  \enumerate{
107
+    \item \code{"sgedge_id"} if \code{by="sgedge"} or \code{"rsgedge_id"}
108
+          if \code{by="rsgedge"}: contains the \emph{global splicing graph
109
+          edge ids} if \code{by="sgedge"}, or the \emph{global reduced
110
+          splicing graph edge ids} if \code{by="rsgedge"}.
111
+    \item \code{"ex_or_in"}: the type of edge. This can be exon or intron
112
+          if \code{by="sgedge"}, or exon, intron, or mixed if
113
+          \code{by="rsgedge"}.
114
+  }
106 115
 }
107 116
 
108 117
 \author{
... ...
@@ -119,6 +119,8 @@ edges_by_gene[["geneA"]]
119 119
 ## "geneA:2,4" (intron), and "geneA:4,5" (exon), during the graph
120 120
 ## reduction.
121 121
 
122
+stopifnot(identical(edges_by_gene["geneB"], rsgedgesByGene(sg["geneB"])))
123
+
122 124
 ## ---------------------------------------------------------------------
123 125
 ## 3. sgedgesByTranscript()
124 126
 ## ---------------------------------------------------------------------
... ...
@@ -4,11 +4,9 @@
4 4
 
5 5
 \alias{sgedges}
6 6
 \alias{sgedges,SplicingGraphs-method}
7
-\alias{sgedges,IntegerList-method}
8
-\alias{sgedges,data.frame-method}
9 7
 
10 8
 \alias{sgnodes}
11
-\alias{sgnodes,ANY-method}
9
+\alias{sgnodes,SplicingGraphs-method}
12 10
 \alias{sgnodes,IntegerList-method}
13 11
 \alias{sgnodes,data.frame-method}
14 12
 \alias{sgnodes,DataFrame-method}
... ...
@@ -94,7 +92,6 @@ check_way1_vs_way2 <- function(res1, res2)
94 92
 {
95 93
     edges1 <- res1[res1$ex_or_in != "", ]  # remove artificial edges
96 94
     edges2 <- mcols(unlist(res2, use.names=FALSE))
97
-    edges2 <- edges2[ , -1]  # remove "sgedge_id" col (global edge id)
98 95
     stopifnot(identical(edges1, edges2))
99 96
 }
100 97