- By default, the output of sgedges() now also includes the "sgedge_id" col,
so it has the same cols as the edge-level metadata cols returned by default
by sgedgesByTranscript() and sgedgesByGene().
- Change the order of the edge-level metadata cols in the GRangesList object
returned by rsgedgesByGene(): the "rsgedge_id" col (global reduced splicing
graph edge id) now is also in 3rd position (instead of 1st), after the
"from" and "to" cols.
- By default, the output of rsgedges() now also includes the "rsgedge_id" col,
so it has the same cols as the edge-level metadata cols returned by default
by rsgedgesByGene().
- The "tx_id" col in the output of all the above functions now is a
CharacterList instead of a broken IntegerList that was holding a factor in
its unlistData slot.
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SplicingGraphs@75803 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -6,8 +6,8 @@ |
6 | 6 |
### *reduced* splicing graphs. |
7 | 7 |
### |
8 | 8 |
|
9 |
-### Return the uninformative fully qualified nodes. |
|
10 |
-.uninformative_fqnodes <- function(sg) |
|
9 |
+### Return the fully qualified uninformative nodes. |
|
10 |
+.get_fq_uninfnodes <- function(sg) |
|
11 | 11 |
{ |
12 | 12 |
txpath <- txpath(unlist(sg)) |
13 | 13 |
skeleton2 <- PartitioningByEnd(end(PartitioningByEnd(txpath)) * 2L) |
... | ... |
@@ -25,65 +25,84 @@ |
25 | 25 |
gene_id <- gene_id[c(TRUE, FALSE)] |
26 | 26 |
from <- tmp[c(TRUE, FALSE)] |
27 | 27 |
to <- tmp[c(FALSE, TRUE)] |
28 |
- sgedge_id <- paste0(gene_id, ":", from, ",", to) |
|
28 |
+ sgedge_id <- make_global_sgedge_id(gene_id, from, to) |
|
29 | 29 |
keep_idx <- which(!duplicated(sgedge_id)) |
30 |
- fqfrom <- paste0(gene_id, ":", from)[keep_idx] # fully qualified ids |
|
31 |
- fqto <- paste0(gene_id, ":", to)[keep_idx] # fully qualified ids |
|
32 |
- uninformative_sgnodes(fqfrom, fqto) |
|
30 |
+ fq_from <- paste(gene_id, from, sep=":")[keep_idx] # fully qualified ids |
|
31 |
+ fq_to <- paste(gene_id, to, sep=":")[keep_idx] # fully qualified ids |
|
32 |
+ uninformative_sgnodes(fq_from, fq_to) |
|
33 | 33 |
} |
34 | 34 |
|
35 |
-.pmerge <- function(x, y) |
|
35 |
+### 'f': factor. The "reverse factor" of 'f' is the named list of integer |
|
36 |
+### vectors that maps each level of 'f' to the positions in 'f' where that |
|
37 |
+### level is used. It can quickly be computed with: |
|
38 |
+### |
|
39 |
+### revfactor <- split(seq_along(f), f) |
|
40 |
+### |
|
41 |
+### 'f' can be rebuilt from 'revfactor' with: |
|
42 |
+### |
|
43 |
+### f2 <- .make_factor_from_revfactor(revfactor, length(f)) |
|
44 |
+### stopifnot(identical(f2, f) |
|
45 |
+### |
|
46 |
+.make_factor_from_revfactor <- function(revfactor, f_len) |
|
36 | 47 |
{ |
37 |
- x_partitioning <- PartitioningByEnd(x) |
|
38 |
- y_partitioning <- PartitioningByEnd(y) |
|
39 |
- stopifnot(identical(x_partitioning, y_partitioning)) |
|
40 |
- starts <- start(x_partitioning) |
|
41 |
- ends <- end(x_partitioning) |
|
42 |
- unlisted_x <- unlist(x, use.names=FALSE) |
|
43 |
- unlisted_y <- unlist(y, use.names=FALSE) |
|
44 |
- stopifnot(identical(unlisted_x[-starts], unlisted_y[-ends])) |
|
45 |
- y <- as(unlisted_y[ends], "List") |
|
46 |
- xy <- c(x, y) |
|
47 |
- f <- rep.int(seq_along(x), 2L) |
|
48 |
- ans <- unlistAndSplit(xy, f) |
|
49 |
- names(ans) <- names(x_partitioning) |
|
50 |
- ans |
|
48 |
+ f_levels <- names(revfactor) |
|
49 |
+ idx <- integer(f_len) |
|
50 |
+ idx[] <- NA_integer_ |
|
51 |
+ idx[unlist(revfactor, use.names=FALSE)] <- rep.int(seq_along(revfactor), |
|
52 |
+ elementLengths(revfactor)) |
|
53 |
+ factor(f_levels[idx], levels=f_levels) |
|
54 |
+} |
|
55 |
+ |
|
56 |
+### 'from' and 'to' must have the same length N (nb of unique edges in the |
|
57 |
+### SplicingGraphs object before reduction). |
|
58 |
+.make_revfactor_from_uninfnodes <- function(uninfnodes, from, to) |
|
59 |
+{ |
|
60 |
+ from_idx <- match(uninfnodes, from) |
|
61 |
+ to_idx <- match(uninfnodes, to) |
|
62 |
+ keep_idx <- which(!(is.na(from_idx) | is.na(to_idx))) |
|
63 |
+ from_idx <- from_idx[keep_idx] |
|
64 |
+ to_idx <- to_idx[keep_idx] |
|
65 |
+ stopifnot(all(from_idx == to_idx + 1L)) # sanity check |
|
66 |
+ if (length(from_idx) == 0L) { |
|
67 |
+ group1 <- integer(0) |
|
68 |
+ } else { |
|
69 |
+ group1 <- cumsum(c(TRUE, diff(from_idx) != 1L)) |
|
70 |
+ } |
|
71 |
+ split_to_idx <- unname(splitAsList(to_idx, group1)) |
|
72 |
+ split_from_idx <- unname(splitAsList(from_idx, group1)) |
|
73 |
+ fancy_punion(split_to_idx, split_from_idx) |
|
51 | 74 |
} |
52 | 75 |
|
53 |
-### 'gene_id', 'from', and 'to', must have the same length N (nb of unique |
|
54 |
-### edges in the SplicingGraphs object befor reduction). |
|
76 |
+### 'revfactor' must be a "reverse factor" as returned by |
|
77 |
+### .make_revfactor_from_uninfnodes(). |
|
55 | 78 |
### Returns a character vector of length N containing the global rsgedge id |
56 | 79 |
### (global reduced splicing graph edge id) corresponding to each input edge. |
57 |
-.build_sgedge2rsgedge_map <- function(gene_id, from, to, ui_fqnodes) |
|
80 |
+.build_sgedge2rsgedge_map_from_revfactor <- function(revfactor, |
|
81 |
+ gene_id, from, to) |
|
58 | 82 |
{ |
59 |
- ans <- paste0(gene_id, ":", from, ",", to) |
|
60 |
- |
|
61 |
- fqfrom <- paste0(gene_id, ":", from) # fully qualified ids |
|
62 |
- fqto <- paste0(gene_id, ":", to) # fully qualified ids |
|
63 |
- idx1 <- match(ui_fqnodes, fqfrom) |
|
64 |
- idx2 <- match(ui_fqnodes, fqto) |
|
65 |
- keep_idx <- which(!(is.na(idx1) | is.na(idx2))) |
|
66 |
- idx1 <- idx1[keep_idx] |
|
67 |
- idx2 <- idx2[keep_idx] |
|
68 |
- stopifnot(all(idx1 == idx2 + 1L)) # sanity check |
|
69 |
- group1 <- cumsum(c(TRUE, diff(idx1) != 1L)) |
|
70 |
- split_idx2 <- unname(splitAsList(idx2, group1)) |
|
71 |
- split_idx1 <- unname(splitAsList(idx1, group1)) |
|
72 |
- idx <- .pmerge(split_idx2, split_idx1) |
|
73 |
- alter_idx <- idx@unlistData |
|
74 |
- |
|
75 |
- from_list <- relist(from[alter_idx], idx) |
|
76 |
- to_list <- relist(to[alter_idx], idx) |
|
77 |
- nodes_list <- .pmerge(from_list, to_list) |
|
83 |
+ ans <- make_global_sgedge_id(gene_id, from, to) |
|
84 |
+ |
|
85 |
+ unlisted_revfactor <- unlist(revfactor, use.names=FALSE) |
|
86 |
+ from_list <- relist(from[unlisted_revfactor], revfactor) |
|
87 |
+ to_list <- relist(to[unlisted_revfactor], revfactor) |
|
88 |
+ nodes_list <- fancy_punion(from_list, to_list) |
|
78 | 89 |
|
79 | 90 |
rsgedge_id <- sapply(nodes_list, paste0, collapse=",") |
80 |
- rsgedge_id <- rep.int(rsgedge_id, elementLengths(idx)) |
|
81 |
- rsgedge_id <- paste(gene_id[alter_idx], rsgedge_id, sep=":") |
|
91 |
+ rsgedge_id <- rep.int(rsgedge_id, elementLengths(revfactor)) |
|
92 |
+ rsgedge_id <- paste(gene_id[unlisted_revfactor], rsgedge_id, sep=":") |
|
82 | 93 |
|
83 |
- ans[alter_idx] <- rsgedge_id |
|
94 |
+ ans[unlisted_revfactor] <- rsgedge_id |
|
84 | 95 |
ans |
85 | 96 |
} |
86 | 97 |
|
98 |
+.build_sgedge2rsgedge_map <- function(uninfnodes, gene_id, from, to) |
|
99 |
+{ |
|
100 |
+ fq_from <- paste(gene_id, from, sep=":") # fully qualified ids |
|
101 |
+ fq_to <- paste(gene_id, to, sep=":") # fully qualified ids |
|
102 |
+ revfactor <- .make_revfactor_from_uninfnodes(uninfnodes, fq_from, fq_to) |
|
103 |
+ .build_sgedge2rsgedge_map_from_revfactor(revfactor, gene_id, from, to) |
|
104 |
+} |
|
105 |
+ |
|
87 | 106 |
### 'edges' must be a GRanges object. |
88 | 107 |
.reduce_edges <- function(edges, f) |
89 | 108 |
{ |
... | ... |
@@ -136,9 +155,9 @@ |
136 | 155 |
stopifnot(identical(edges_tx_id, edges_tx_id[sm])) # sanity check |
137 | 156 |
ans_tx_id <- edges_tx_id[keep_idx] |
138 | 157 |
|
139 |
- ans_mcols <- DataFrame(rsgedge_id=levels(f), |
|
140 |
- from=ans_from, |
|
158 |
+ ans_mcols <- DataFrame(from=ans_from, |
|
141 | 159 |
to=ans_to, |
160 |
+ rsgedge_id=levels(f), |
|
142 | 161 |
ex_or_in=ans_ex_or_in, |
143 | 162 |
tx_id=ans_tx_id) |
144 | 163 |
|
... | ... |
@@ -244,9 +263,9 @@ setMethod("rsgedgesByGene", "SplicingGraphs", |
244 | 263 |
gene_id <- names(edges0) |
245 | 264 |
from <- edges0_mcols[ , "from"] |
246 | 265 |
to <- edges0_mcols[ , "to"] |
247 |
- ui_fqnodes <- .uninformative_fqnodes(x) |
|
248 |
- sgedge2rsgedge_map <- .build_sgedge2rsgedge_map(gene_id, from, to, |
|
249 |
- ui_fqnodes) |
|
266 |
+ uninfnodes <- .get_fq_uninfnodes(x) |
|
267 |
+ sgedge2rsgedge_map <- .build_sgedge2rsgedge_map(uninfnodes, |
|
268 |
+ gene_id, from, to) |
|
250 | 269 |
f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map)) |
251 | 270 |
ans_flesh <- .reduce_edges(edges0, f) |
252 | 271 |
ans_flesh_names <- Rle(names(ans_flesh)) |
... | ... |
@@ -261,7 +280,7 @@ setMethod("rsgedgesByGene", "SplicingGraphs", |
261 | 280 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
262 | 281 |
### rsgedges() extractor |
263 | 282 |
### |
264 |
-### Same as sgedges() except that uninformative nodes (i.e. SSids) are removed. |
|
283 |
+### Same as sgedges() except that uninformative nodes are removed. |
|
265 | 284 |
### |
266 | 285 |
|
267 | 286 |
### 'sgedges' must be a DataFrame as returned by: |
... | ... |
@@ -278,15 +297,47 @@ setMethod("rsgedgesByGene", "SplicingGraphs", |
278 | 297 |
"to rsgedges()?") |
279 | 298 |
levels(ex_or_in) <- EX_OR_IN_LEVELS2 |
280 | 299 |
uninformative_SSids <- uninformativeSSids(sgedges) |
281 |
- if (length(uninformative_SSids) == 0L) |
|
300 |
+ if (length(uninformative_SSids) == 0L) { |
|
301 |
+ col_idx <- match("sgedge_id", colnames(sgedges)) |
|
302 |
+ colnames(sgedges)[col_idx] <- "rsgedge_id" |
|
303 |
+ sgedges$ex_or_in <- ex_or_in |
|
282 | 304 |
return(sgedges) |
305 |
+ } |
|
306 |
+ |
|
283 | 307 |
from <- sgedges[ , "from"] |
284 | 308 |
to <- sgedges[ , "to"] |
309 |
+ from_idx <- match(uninformative_SSids, from) |
|
310 |
+ to_idx <- match(uninformative_SSids, to) |
|
311 |
+ if (!all(from_idx == to_idx + 1L)) |
|
312 |
+ stop("Malformed input.\n", |
|
313 |
+ " In the input data.frame (or DataFrame) representing the ", |
|
314 |
+ "original splicing graph, each uninformative splicing site ", |
|
315 |
+ "id must appear in 2 consecutive rows (first in the \"to\" ", |
|
316 |
+ "column, then in the \"from\" column. Could it be that the ", |
|
317 |
+ "rows were subsetted before the data.frame (or DataFrame) ", |
|
318 |
+ "was passed to rsgedges()?") |
|
319 |
+ |
|
320 |
+ ## Reduce "from" and "to" cols. |
|
321 |
+ ans_from <- from[-from_idx] |
|
322 |
+ ans_to <- to[-to_idx] |
|
323 |
+ |
|
324 |
+ ## Reduce "sgedge_id" col. |
|
325 |
+ sgedges_id <- sgedges[ , "sgedge_id"] |
|
326 |
+ tmp <- unlist(strsplit(sgedges_id, ":", fixed=TRUE), use.names=FALSE) |
|
327 |
+ gene_id <- tmp[c(TRUE, FALSE)] |
|
328 |
+ revfactor <- .make_revfactor_from_uninfnodes(uninformative_SSids, from, to) |
|
329 |
+ sgedge2rsgedge_map <- .build_sgedge2rsgedge_map_from_revfactor(revfactor, |
|
330 |
+ gene_id, from, to) |
|
331 |
+ f <- factor(sgedge2rsgedge_map, levels=unique(sgedge2rsgedge_map)) |
|
332 |
+ ans_rsgedge_id <- levels(f) |
|
333 |
+ |
|
334 |
+ ## Reduce "ex_or_in" col. |
|
335 |
+ ex_or_in[from_idx] <- EX_OR_IN_LEVELS2[4L] |
|
336 |
+ ans_ex_or_in <- ex_or_in[-to_idx] |
|
337 |
+ |
|
338 |
+ ## Reduce "tx_id" col. |
|
285 | 339 |
tx_id <- sgedges[ , "tx_id"] |
286 |
- idx1 <- match(uninformative_SSids, from) |
|
287 |
- idx2 <- match(uninformative_SSids, to) |
|
288 |
- ## 2 sanity checks. |
|
289 |
- if (!identical(unname(tx_id[idx1]), unname(tx_id[idx2]))) |
|
340 |
+ if (!identical(unname(tx_id[from_idx]), unname(tx_id[to_idx]))) |
|
290 | 341 |
stop("Malformed input.\n", |
291 | 342 |
" In the input data.frame (or DataFrame) representing the ", |
292 | 343 |
"original splicing graph, the 2 rows containing a given ", |
... | ... |
@@ -294,20 +345,13 @@ setMethod("rsgedgesByGene", "SplicingGraphs", |
294 | 345 |
"Could it be that the \"tx_id\" column was manually altered ", |
295 | 346 |
"before the data.frame (or DataFrame) was passed to ", |
296 | 347 |
"rsgedges()?") |
297 |
- if (!all(idx1 == idx2 + 1L)) |
|
298 |
- stop("Malformed input.\n", |
|
299 |
- " In the input data.frame (or DataFrame) representing the ", |
|
300 |
- "original splicing graph, each uninformative splicing site ", |
|
301 |
- "id must appear in 2 consecutive rows (first in the \"to\" ", |
|
302 |
- "column, then in the \"from\" column. Could it be that the ", |
|
303 |
- "rows were subsetted before the data.frame (or DataFrame) ", |
|
304 |
- "was passed to rsgedges()?") |
|
305 |
- from <- from[-idx1] |
|
306 |
- to <- to[-idx2] |
|
307 |
- ex_or_in[idx1] <- EX_OR_IN_LEVELS2[4L] |
|
308 |
- ex_or_in <- ex_or_in[-idx2] |
|
309 |
- tx_id <- tx_id[-idx1] |
|
310 |
- DataFrame(from=from, to=to, ex_or_in=ex_or_in, tx_id=tx_id) |
|
348 |
+ ans_tx_id <- tx_id[-from_idx] |
|
349 |
+ |
|
350 |
+ DataFrame(from=ans_from, |
|
351 |
+ to=ans_to, |
|
352 |
+ rsgedge_id=ans_rsgedge_id, |
|
353 |
+ ex_or_in=ans_ex_or_in, |
|
354 |
+ tx_id=ans_tx_id) |
|
311 | 355 |
} |
312 | 356 |
|
313 | 357 |
rsgedges <- function(x) |
... | ... |
@@ -22,7 +22,7 @@ |
22 | 22 |
### transcript) for a given gene. Should have been obtained thru the txpath() |
23 | 23 |
### accessor. Returns a 4-col (or 5-col if 'txweight' is supplied) data.frame |
24 | 24 |
### representing the splicing graph. |
25 |
-.make_sgedges0_from_txpath <- function(txpath, txweight=NULL) |
|
25 |
+.make_sgedges0_from_txpath <- function(txpath, gene_id, txweight=NULL) |
|
26 | 26 |
{ |
27 | 27 |
if (!is.null(txweight)) { |
28 | 28 |
if (!is.numeric(txweight)) |
... | ... |
@@ -40,6 +40,8 @@ |
40 | 40 |
"an odd number of splicing site ids") |
41 | 41 |
from <- c("R", txpath_i) |
42 | 42 |
to <- c(txpath_i, "L") |
43 |
+ sgedge_id <- make_global_sgedge_id(gene_id, |
|
44 |
+ from, to) |
|
43 | 45 |
nexons <- txpath_i_len %/% 2L |
44 | 46 |
if (nexons == 0L) { |
45 | 47 |
ex_or_in <- EX_OR_IN_LEVELS[3L] |
... | ... |
@@ -55,6 +57,7 @@ |
55 | 57 |
levels=EX_OR_IN_LEVELS) |
56 | 58 |
data.frame(from=from, |
57 | 59 |
to=to, |
60 |
+ sgedge_id=sgedge_id, |
|
58 | 61 |
ex_or_in=ex_or_in, |
59 | 62 |
stringsAsFactors=FALSE) |
60 | 63 |
}) |
... | ... |
@@ -63,7 +66,7 @@ |
63 | 66 |
tx_id <- names(txpath) |
64 | 67 |
if (is.null(tx_id)) |
65 | 68 |
tx_id <- seq_along(txpath) |
66 |
- tx_id <- rep.int(factor(tx_id, levels=tx_id), nedges_per_tx) |
|
69 |
+ tx_id <- rep.int(tx_id, nedges_per_tx) |
|
67 | 70 |
sgedges0$tx_id <- tx_id |
68 | 71 |
if (!is.null(txweight)) |
69 | 72 |
sgedges0$txweight <- rep.int(txweight, nedges_per_tx) |
... | ... |
@@ -72,17 +75,15 @@ |
72 | 75 |
|
73 | 76 |
### Collapse the duplicated edges in 'sgedges0' into a DataFrame. |
74 | 77 |
### We use a DataFrame instead of a data.frame because we want to store |
75 |
-### the tx_id col in a CompressedFactorList (even though this container |
|
76 |
-### doesn't formally exist and a CompressedIntegerList is actually what's |
|
77 |
-### being used). |
|
78 |
+### the "tx_id" col in a CharacterList. |
|
78 | 79 |
.make_sgedges_from_sgedges0 <- function(sgedges0, ex_hits=NULL, in_hits=NULL) |
79 | 80 |
{ |
80 | 81 |
from <- sgedges0[ , "from"] |
81 | 82 |
to <- sgedges0[ , "to"] |
83 |
+ sgedge_id <- sgedges0[ , "sgedge_id"] |
|
82 | 84 |
ex_or_in <- sgedges0[ , "ex_or_in"] |
83 | 85 |
tx_id <- sgedges0[ , "tx_id"] |
84 |
- edges <- paste(from, to, sep="~") |
|
85 |
- sm <- match(edges, edges) |
|
86 |
+ sm <- match(sgedge_id, sgedge_id) |
|
86 | 87 |
if (!all(ex_or_in == ex_or_in[sm])) |
87 | 88 |
stop("invalid splicing graph") |
88 | 89 |
is_not_dup <- sm == seq_along(sm) |
... | ... |
@@ -178,14 +179,14 @@ setMethod("sgedges", "SplicingGraphs", |
178 | 179 |
{ |
179 | 180 |
if (!isTRUEorFALSE(keep.dup.edges)) |
180 | 181 |
stop("'keep.dup.edges' must be TRUE or FALSE") |
181 |
- txpath <- txpath(x) |
|
182 |
+ txpath <- txpath(x) # fails if length(x) != 1 |
|
183 |
+ gene_id <- names(x) |
|
182 | 184 |
if (is.null(txweight)) |
183 | 185 |
txweight <- txweight(x) |
186 |
+ sgedges0 <- .make_sgedges0_from_txpath(txpath, gene_id, |
|
187 |
+ txweight=txweight) |
|
184 | 188 |
if (keep.dup.edges) |
185 |
- return(sgedges(txpath, txweight=txweight, |
|
186 |
- keep.dup.edges=keep.dup.edges)) |
|
187 |
- sgedges0 <- sgedges(txpath, txweight=txweight, |
|
188 |
- keep.dup.edges=TRUE) |
|
189 |
+ return(sgedges0) |
|
189 | 190 |
exon_hits <- .extract_sgedges_exon_hits(x) |
190 | 191 |
intron_hits <- .extract_sgedges_intron_hits(x) |
191 | 192 |
## FIXME: Once .extract_sgedges_intron_hits() is fixed, merge the |
... | ... |
@@ -198,28 +199,6 @@ setMethod("sgedges", "SplicingGraphs", |
198 | 199 |
} |
199 | 200 |
) |
200 | 201 |
|
201 |
-setMethod("sgedges", "IntegerList", |
|
202 |
- function(x, txweight=NULL, keep.dup.edges=FALSE) |
|
203 |
- { |
|
204 |
- sgedges0 <- .make_sgedges0_from_txpath(x, txweight=txweight) |
|
205 |
- sgedges(sgedges0, keep.dup.edges=keep.dup.edges) |
|
206 |
- } |
|
207 |
-) |
|
208 |
- |
|
209 |
-setMethod("sgedges", "data.frame", |
|
210 |
- function(x, txweight=NULL, keep.dup.edges=FALSE) |
|
211 |
- { |
|
212 |
- if (!is.null(txweight)) |
|
213 |
- stop("the 'txweight' arg is not supported ", |
|
214 |
- "when 'x' is a data.frame") |
|
215 |
- if (!isTRUEorFALSE(keep.dup.edges)) |
|
216 |
- stop("'keep.dup.edges' must be TRUE or FALSE") |
|
217 |
- if (keep.dup.edges) |
|
218 |
- return(x) # no-op |
|
219 |
- .make_sgedges_from_sgedges0(x) |
|
220 |
- } |
|
221 |
-) |
|
222 |
- |
|
223 | 202 |
|
224 | 203 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
225 | 204 |
### sgnodes() accessor |
... | ... |
@@ -229,7 +208,7 @@ setGeneric("sgnodes", signature="x", |
229 | 208 |
function(x) standardGeneric("sgnodes") |
230 | 209 |
) |
231 | 210 |
|
232 |
-setMethod("sgnodes", "ANY", |
|
211 |
+setMethod("sgnodes", "SplicingGraphs", |
|
233 | 212 |
function(x) |
234 | 213 |
{ |
235 | 214 |
txpath <- txpath(x) |
... | ... |
@@ -270,7 +249,8 @@ setMethod("outdeg", "DataFrame", |
270 | 249 |
function(x) |
271 | 250 |
{ |
272 | 251 |
sgnodes <- sgnodes(x) |
273 |
- ans <- countMatches(sgnodes, x[ , "from"]) |
|
252 |
+ m <- match(x[ , "from"], sgnodes) |
|
253 |
+ ans <- tabulate(m, nbins=length(sgnodes)) |
|
274 | 254 |
names(ans) <- sgnodes |
275 | 255 |
ans |
276 | 256 |
} |
... | ... |
@@ -292,7 +272,8 @@ setMethod("indeg", "DataFrame", |
292 | 272 |
function(x) |
293 | 273 |
{ |
294 | 274 |
sgnodes <- sgnodes(x) |
295 |
- ans <- countMatches(sgnodes, x[ , "to"]) |
|
275 |
+ m <- match(x[ , "to"], sgnodes) |
|
276 |
+ ans <- tabulate(m, nbins=length(sgnodes)) |
|
296 | 277 |
names(ans) <- sgnodes |
297 | 278 |
ans |
298 | 279 |
} |
... | ... |
@@ -82,7 +82,6 @@ setMethod("sgedgesByTranscript", "SplicingGraphs", |
82 | 82 |
in_prepend_mcols$ex_or_in <- ex_or_in |
83 | 83 |
|
84 | 84 |
tx_id <- mcols(ex_by_tx)[ , "tx_id"] |
85 |
- tx_id <- factor(tx_id, levels=unique(tx_id)) |
|
86 | 85 |
ex_prepend_mcols$tx_id <- rep.int(tx_id, nex_by_tx) |
87 | 86 |
in_prepend_mcols$tx_id <- rep.int(tx_id, nin_by_tx) |
88 | 87 |
|
... | ... |
@@ -131,11 +130,14 @@ setMethod("sgedgesByTranscript", "SplicingGraphs", |
131 | 130 |
stopifnot(identical(ans_unlistData_end[minus_introns_idx] + 1L, |
132 | 131 |
ans_unlistData_start[minus_introns_idx - 1L])) |
133 | 132 |
|
134 |
- ## Add "sgedge_id" metadata col. |
|
135 |
- sgedge_id <- paste0(rep.int(gene_ids, width(ans_partitioning)), ":", |
|
136 |
- from, ",", to) |
|
137 |
- ans_unlistData_mcols <- cbind(DataFrame(sgedge_id=sgedge_id), |
|
138 |
- ans_unlistData_mcols) |
|
133 |
+ ## Insert "sgedge_id" metadata col after first 2 metadata cols |
|
134 |
+ ## ("from" and "to"). |
|
135 |
+ sgedge_id <- make_global_sgedge_id( |
|
136 |
+ rep.int(gene_ids, width(ans_partitioning)), |
|
137 |
+ from, to) |
|
138 |
+ ans_unlistData_mcols <- c(ans_unlistData_mcols[1:2], |
|
139 |
+ DataFrame(sgedge_id=sgedge_id), |
|
140 |
+ ans_unlistData_mcols[-(1:2)]) |
|
139 | 141 |
check_all_edge_mcolnames(colnames(ans_unlistData_mcols)) |
140 | 142 |
|
141 | 143 |
## Drop unwanted columns. |
... | ... |
@@ -98,13 +98,13 @@ commonStrand.GRangesList <- function(x) |
98 | 98 |
EXON_MCOLS <- c("exon_id", "exon_name", "exon_rank", "start_SSid", "end_SSid") |
99 | 99 |
|
100 | 100 |
### All edge metadata columns. |
101 |
-ALL_EDGE_MCOLS <- c("sgedge_id", "from", "to", "ex_or_in", "tx_id", EXON_MCOLS) |
|
101 |
+ALL_EDGE_MCOLS <- c("from", "to", "sgedge_id", "ex_or_in", "tx_id", EXON_MCOLS) |
|
102 | 102 |
|
103 | 103 |
### Subset of 'ALL_EDGE_MCOLS' made of those columns that are considered |
104 | 104 |
### invariant i.e. the values in them associated with the same sgedge_id |
105 | 105 |
### (global edge id) should be the same. Note that we also include the |
106 | 106 |
### "sgedge_id" col itself. |
107 |
-INVARIANT_EDGE_MCOLS <- c("sgedge_id", "from", "to", "ex_or_in", |
|
107 |
+INVARIANT_EDGE_MCOLS <- c("from", "to", "sgedge_id", "ex_or_in", |
|
108 | 108 |
"start_SSid", "end_SSid") |
109 | 109 |
|
110 | 110 |
EX_OR_IN_LEVELS2 <- c("ex", "in", "", "mixed") |
... | ... |
@@ -155,6 +155,23 @@ get_index_of_invariant_edge_mcols <- function(colnames) |
155 | 155 |
} |
156 | 156 |
|
157 | 157 |
|
158 |
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
|
159 |
+### make_sgedge_id() |
|
160 |
+### |
|
161 |
+### Returns "global splicing graph edge id". |
|
162 |
+### |
|
163 |
+ |
|
164 |
+make_global_sgedge_id <- function(gene_id, from, to) |
|
165 |
+{ |
|
166 |
+ ans_len <- length(from) |
|
167 |
+ stopifnot(length(to) == ans_len) |
|
168 |
+ stopifnot(length(gene_id) == 1L || length(gene_id) == ans_len) |
|
169 |
+ if (ans_len == 0L) |
|
170 |
+ return(character(0)) |
|
171 |
+ paste0(gene_id, ":", from, ",", to) |
|
172 |
+} |
|
173 |
+ |
|
174 |
+ |
|
158 | 175 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
159 | 176 |
### unlistAndSplit() |
160 | 177 |
### |
... | ... |
@@ -184,6 +201,55 @@ unlistAndSplit <- function(x, f, drop=FALSE) |
184 | 201 |
} |
185 | 202 |
|
186 | 203 |
|
204 |
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
|
205 |
+### fancy_punion() -- fancy parallel union |
|
206 |
+### |
|
207 |
+### Expects 'x' and 'y' to be 2 list-like objects such that: |
|
208 |
+### (a) all the list elements in 'x' and 'y' are vector-like objects of the |
|
209 |
+### same class; |
|
210 |
+### (b) 'x' and 'y' have the same length and names; |
|
211 |
+### (c) 'x' and 'y' have the same shape i.e. for any valid 'i', 'x[[i]]' |
|
212 |
+### and 'y[[i]]' have the same length; |
|
213 |
+### (d) 'x' and 'y' have no zero-length list elements; |
|
214 |
+### (e) for any valid index 'i', 'y[[i]][-L_i]' is identical to 'x[[i]][-1]', |
|
215 |
+### where L_i is the length of 'y[[i]]' (and 'x[[i]]'). |
|
216 |
+### Performs an optimized 'mendoapply(union, x, y)'. |
|
217 |
+### |
|
218 |
+### Example: |
|
219 |
+### |
|
220 |
+### > x <- IntegerList(c(12, 4, 9), 5, c(8, -2)) |
|
221 |
+### > y <- IntegerList(c(4, 9, 8), 0, c(-2, 10)) |
|
222 |
+### > fancy_punion(x, y) |
|
223 |
+### IntegerList of length 3 |
|
224 |
+### [[1]] 12 4 9 8 |
|
225 |
+### [[2]] 5 0 |
|
226 |
+### [[3]] 8 -2 10 |
|
227 |
+### |
|
228 |
+fancy_punion <- function(x, y) |
|
229 |
+{ |
|
230 |
+ x_partitioning <- PartitioningByEnd(x) |
|
231 |
+ y_partitioning <- PartitioningByEnd(y) |
|
232 |
+ if (!identical(x_partitioning, y_partitioning)) |
|
233 |
+ stop("'x' and 'y' must have the same length, names, and shape") |
|
234 |
+ starts <- start(x_partitioning) |
|
235 |
+ ends <- end(x_partitioning) |
|
236 |
+ if (any(ends - starts == -1L)) |
|
237 |
+ stop("'x' and 'y' have zero-length list elements") |
|
238 |
+ x_flesh <- unlist(x, use.names=FALSE) |
|
239 |
+ y_flesh <- unlist(y, use.names=FALSE) |
|
240 |
+ if (!identical(x_flesh[-starts], y_flesh[-ends])) |
|
241 |
+ stop("for any valid index 'i', 'y[[i]][-length(y[[i]])]' ", |
|
242 |
+ "must be identical to 'x[[i]][-1]'") |
|
243 |
+ ans_breakpoints <- ends + seq_along(ends) |
|
244 |
+ ans_flesh <- c(x_flesh, y_flesh[ends]) |
|
245 |
+ ans_flesh[-ans_breakpoints] <- x_flesh |
|
246 |
+ ans_flesh[ans_breakpoints] <- y_flesh[ends] |
|
247 |
+ ans_skeleton <- PartitioningByEnd(ans_breakpoints, |
|
248 |
+ names=names(x_partitioning)) |
|
249 |
+ relist(ans_flesh, ans_skeleton) |
|
250 |
+} |
|
251 |
+ |
|
252 |
+ |
|
187 | 253 |
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
188 | 254 |
### uninformative_sgnodes() |
189 | 255 |
### |
... | ... |
@@ -543,6 +543,8 @@ The \Rclass{DataFrame} object returned by \Rfunction{sgedges} has the |
543 | 543 |
following columns: |
544 | 544 |
\begin{itemize} |
545 | 545 |
\item \Rcode{from}, \Rcode{to}: The 2 nodes connected by the edge. |
546 |
+ \item \Rcode{sgedge\_id}: A \textit{global edge id} of the form |
|
547 |
+ \Rcode{gene\_id:from,to}. |
|
546 | 548 |
\item \Rcode{ex\_or\_in}: The type of the edge, i.e., exon, intron, or |
547 | 549 |
no type if it's an artificial edge. |
548 | 550 |
\item \Rcode{tx\_id}: The ids of the transcripts that support the edge. |
... | ... |
@@ -566,17 +568,11 @@ edges_by_gene[["107328"]] |
566 | 568 |
@ |
567 | 569 |
\end{small} |
568 | 570 |
|
569 |
-The edge-level metadata columns are the following: |
|
570 |
-\begin{itemize} |
|
571 |
- \item \Rcode{sgedge\_id}: A \textit{global edge id} of the form |
|
572 |
- \Rcode{gene\_id:from,to}. |
|
573 |
- \item \Rcode{from}, \Rcode{to}, \Rcode{ex\_or\_in}, \Rcode{tx\_id}: |
|
574 |
- See the columns of the \Rclass{DataFrame} object returned |
|
575 |
- by \Rfunction{sgedges}. |
|
576 |
-\end{itemize} |
|
577 |
- |
|
578 |
-The artificial edges (i.e., edges starting from the root |
|
579 |
-node (\Rcode{R}) or ending at the leaf node (\Rcode{L})) are omitted! |
|
571 |
+The edge-level metadata columns are the same as the columns of the |
|
572 |
+\Rclass{DataFrame} object returned by \Rfunction{sgedges}. |
|
573 |
+An important difference though is that the artificial edges (i.e., edges |
|
574 |
+starting from the root node (\Rcode{R}) or ending at the leaf node |
|
575 |
+(\Rcode{L})) are omitted! |
|
580 | 576 |
|
581 | 577 |
Finally, to plot a given splicing graph: |
582 | 578 |
|
... | ... |
@@ -99,10 +99,19 @@ countReads(x, by=c("sgedge", "rsgedge")) |
99 | 99 |
the reads assigned to it. |
100 | 100 |
|
101 | 101 |
For \code{countReads}: a \link[IRanges]{DataFrame} object with one row |
102 |
- per unique splicing graph edge and one column of counts per sample. |
|
103 |
- Two additional columns (\code{"sgedge_id"} and \code{"ex_or_in"}) contain |
|
104 |
- the splicing graph edge ids and the type of edge (exon or intron), |
|
105 |
- respectively. |
|
102 |
+ per unique splicing graph edge if \code{by="sgedge"}, or one row per |
|
103 |
+ unique reduced splicing graph edge if \code{by="rsgedge"}. |
|
104 |
+ There is one column of counts per sample, and the following two |
|
105 |
+ additional columns: |
|
106 |
+ \enumerate{ |
|
107 |
+ \item \code{"sgedge_id"} if \code{by="sgedge"} or \code{"rsgedge_id"} |
|
108 |
+ if \code{by="rsgedge"}: contains the \emph{global splicing graph |
|
109 |
+ edge ids} if \code{by="sgedge"}, or the \emph{global reduced |
|
110 |
+ splicing graph edge ids} if \code{by="rsgedge"}. |
|
111 |
+ \item \code{"ex_or_in"}: the type of edge. This can be exon or intron |
|
112 |
+ if \code{by="sgedge"}, or exon, intron, or mixed if |
|
113 |
+ \code{by="rsgedge"}. |
|
114 |
+ } |
|
106 | 115 |
} |
107 | 116 |
|
108 | 117 |
\author{ |
... | ... |
@@ -119,6 +119,8 @@ edges_by_gene[["geneA"]] |
119 | 119 |
## "geneA:2,4" (intron), and "geneA:4,5" (exon), during the graph |
120 | 120 |
## reduction. |
121 | 121 |
|
122 |
+stopifnot(identical(edges_by_gene["geneB"], rsgedgesByGene(sg["geneB"]))) |
|
123 |
+ |
|
122 | 124 |
## --------------------------------------------------------------------- |
123 | 125 |
## 3. sgedgesByTranscript() |
124 | 126 |
## --------------------------------------------------------------------- |
... | ... |
@@ -4,11 +4,9 @@ |
4 | 4 |
|
5 | 5 |
\alias{sgedges} |
6 | 6 |
\alias{sgedges,SplicingGraphs-method} |
7 |
-\alias{sgedges,IntegerList-method} |
|
8 |
-\alias{sgedges,data.frame-method} |
|
9 | 7 |
|
10 | 8 |
\alias{sgnodes} |
11 |
-\alias{sgnodes,ANY-method} |
|
9 |
+\alias{sgnodes,SplicingGraphs-method} |
|
12 | 10 |
\alias{sgnodes,IntegerList-method} |
13 | 11 |
\alias{sgnodes,data.frame-method} |
14 | 12 |
\alias{sgnodes,DataFrame-method} |
... | ... |
@@ -94,7 +92,6 @@ check_way1_vs_way2 <- function(res1, res2) |
94 | 92 |
{ |
95 | 93 |
edges1 <- res1[res1$ex_or_in != "", ] # remove artificial edges |
96 | 94 |
edges2 <- mcols(unlist(res2, use.names=FALSE)) |
97 |
- edges2 <- edges2[ , -1] # remove "sgedge_id" col (global edge id) |
|
98 | 95 |
stopifnot(identical(edges1, edges2)) |
99 | 96 |
} |
100 | 97 |
|