Browse code

Add getReads(). Similar to countReads() but returns right before the final counting step, that is, the returned DataFrame contains the reads instead of their counts.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SplicingGraphs@76127 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 29/04/2013 18:02:30
Showing 5 changed files

... ...
@@ -102,6 +102,7 @@ export(
102 102
     bubbles,
103 103
 
104 104
     ## countReads-methods.R:
105
+    getReads,
105 106
     countReads
106 107
 )
107 108
 
... ...
@@ -121,6 +122,7 @@ exportMethods(
121 122
     rsgedgesByTranscript,
122 123
     rsgedgesByGene,
123 124
     bubbles,
125
+    getReads,
124 126
     countReads
125 127
 )
126 128
 
... ...
@@ -1,35 +1,37 @@
1 1
 ### =========================================================================
2
-### Summarizing the reads assigned to the edges of a SplicingGraphs object
2
+### Summarizing the reads assigned to a SplicingGraphs object
3 3
 ### -------------------------------------------------------------------------
4 4
 
5 5
 
6
-.countReads_by_sgedge <- function(x)
6
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7
+### getReads()
8
+###
9
+
10
+.getReads_by_sgedge <- function(x)
7 11
 {
8 12
     edges_by_gene <- sgedgesByGene(x, with.hits.mcols=TRUE)
9 13
     edge_data <- mcols(unlist(edges_by_gene, use.names=FALSE))
10 14
     edge_data_colnames <- colnames(edge_data)
11 15
     hits_mcol_idx <- grep("\\.hits$", edge_data_colnames)
12
-    ans <- endoapply(edge_data[hits_mcol_idx], elementLengths)
13
-    colnames(ans) <- sub("\\.hits$", "", colnames(ans))
16
+    hits_cols <- edge_data[hits_mcol_idx]
14 17
     left_mcolnames <- c("sgedge_id", "ex_or_in")
15 18
     left_cols <- edge_data[left_mcolnames]
16
-    cbind(left_cols, ans)
19
+    cbind(left_cols, hits_cols)
17 20
 }
18 21
 
19
-.countReads_by_rsgedge <- function(x)
22
+.getReads_by_rsgedge <- function(x)
20 23
 {
21 24
     edges_by_gene <- rsgedgesByGene(x, with.hits.mcols=TRUE)
22 25
     edge_data <- mcols(unlist(edges_by_gene, use.names=FALSE))
23 26
     edge_data_colnames <- colnames(edge_data)
24 27
     hits_mcol_idx <- grep("\\.hits$", edge_data_colnames)
25
-    ans <- endoapply(edge_data[hits_mcol_idx], elementLengths)
26
-    colnames(ans) <- sub("\\.hits$", "", colnames(ans))
28
+    hits_cols <- edge_data[hits_mcol_idx]
27 29
     left_mcolnames <- c("rsgedge_id", "ex_or_in")
28 30
     left_cols <- edge_data[left_mcolnames]
29
-    cbind(left_cols, ans)
31
+    cbind(left_cols, hits_cols)
30 32
 }
31 33
 
32
-.countReads_by_tx <- function(x)
34
+.getReads_by_tx <- function(x)
33 35
 {
34 36
     ex_by_tx <- unlist(x)
35 37
     tx_id <- mcols(ex_by_tx)[ , "tx_id"]
... ...
@@ -44,20 +46,18 @@
44 46
     ## a function 'FUN' that modifies the nb of rows. Furthermore, the
45 47
     ## returned object passes validation despite being broken! Fix it
46 48
     ## in IRanges.
47
-    ans <- endoapply(edge_data[hits_mcol_idx],
48
-                     function(hits)
49
-                       elementLengths(unique(regroup(hits,
50
-                                                     edge_data_breakpoints))))
49
+    hits_cols <- endoapply(edge_data[hits_mcol_idx],
50
+                           function(hits)
51
+                             unique(regroup(hits, edge_data_breakpoints)))
51 52
 
52 53
     ## Fix the broken DataFrame returned by endoapply().
53
-    ans@nrows <- length(tx_id)
54
-    ans@rownames <- NULL
54
+    hits_cols@nrows <- length(tx_id)
55
+    hits_cols@rownames <- NULL
55 56
 
56
-    colnames(ans) <- sub("\\.hits$", "", colnames(ans))
57
-    cbind(DataFrame(tx_id=tx_id, gene_id=gene_id), ans)
57
+    cbind(DataFrame(tx_id=tx_id, gene_id=gene_id), hits_cols)
58 58
 }
59 59
 
60
-.countReads_by_gene <- function(x)
60
+.getReads_by_gene <- function(x)
61 61
 {
62 62
     edges_by_gene <- sgedgesByGene(x, with.hits.mcols=TRUE)
63 63
     edge_data <- mcols(unlist(edges_by_gene, use.names=FALSE))
... ...
@@ -69,21 +69,43 @@
69 69
     ## a function 'FUN' that modifies the nb of rows. Furthermore, the
70 70
     ## returned object passes validation despite being broken! Fix it
71 71
     ## in IRanges.
72
-    ans <- endoapply(edge_data[hits_mcol_idx],
72
+    hits_cols <- endoapply(edge_data[hits_mcol_idx],
73 73
                      function(hits)
74
-                       elementLengths(unique(regroup(hits,
75
-                                                     edge_data_breakpoints))))
74
+                       unique(regroup(hits, edge_data_breakpoints)))
76 75
 
77 76
     ## Fix the broken DataFrame returned by endoapply().
78
-    ans@nrows <- length(edges_by_gene)
79
-    ans@rownames <- NULL
77
+    hits_cols@nrows <- length(edges_by_gene)
78
+    hits_cols@rownames <- NULL
80 79
 
81
-    colnames(ans) <- sub("\\.hits$", "", colnames(ans))
82 80
     gene_id <- names(edges_by_gene)
83 81
     tx_id <- unique(regroup(edge_data[ , "tx_id"], edge_data_breakpoints))
84
-    cbind(DataFrame(gene_id=gene_id, tx_id=tx_id), ans)
82
+    cbind(DataFrame(gene_id=gene_id, tx_id=tx_id), hits_cols)
85 83
 }
86 84
 
85
+setGeneric("getReads", signature="x",
86
+    function(x, by=c("sgedge", "rsgedge", "tx", "gene"))
87
+        standardGeneric("getReads")
88
+)
89
+
90
+### Return a DataFrame with 1 row per splicing graph edge (or reduced
91
+### splicing graph edge), and 1 column per sample.
92
+setMethod("getReads", "SplicingGraphs",
93
+    function(x, by=c("sgedge", "rsgedge", "tx", "gene"))
94
+    {
95
+        by <- match.arg(by)
96
+        switch(by,
97
+            sgedge=.getReads_by_sgedge(x),
98
+            rsgedge=.getReads_by_rsgedge(x),
99
+            tx=.getReads_by_tx(x),
100
+            gene=.getReads_by_gene(x))
101
+    }
102
+)
103
+
104
+
105
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
106
+### countReads()
107
+###
108
+
87 109
 setGeneric("countReads", signature="x",
88 110
     function(x, by=c("sgedge", "rsgedge", "tx", "gene"))
89 111
         standardGeneric("countReads")
... ...
@@ -94,12 +116,13 @@ setGeneric("countReads", signature="x",
94 116
 setMethod("countReads", "SplicingGraphs",
95 117
     function(x, by=c("sgedge", "rsgedge", "tx", "gene"))
96 118
     {
97
-        by <- match.arg(by)
98
-        switch(by,
99
-            sgedge=.countReads_by_sgedge(x),
100
-            rsgedge=.countReads_by_rsgedge(x),
101
-            tx=.countReads_by_tx(x),
102
-            gene=.countReads_by_gene(x))
119
+        assigned_reads <- getReads(x, by=by)
120
+        hits_col_idx <- grep("\\.hits$", colnames(assigned_reads))
121
+        if (length(hits_col_idx) == 0L)
122
+            return(assigned_reads)
123
+        read_counts <- endoapply(assigned_reads[hits_col_idx], elementLengths)
124
+        colnames(read_counts) <- sub("\\.hits$", "", colnames(read_counts))
125
+        cbind(assigned_reads[-hits_col_idx], read_counts)
103 126
     }
104 127
 )
105 128
 
... ...
@@ -2,19 +2,10 @@ o Replace the RNA-seq data used in the vignette by data that actually
2 2
   contains junction reads. The BAM files in TBX20BamSubset don't contain
3 3
   any!
4 4
 
5
-o Move vignette to vignettes/
6
-
7
-o Implement rsgedgesByTranscript().
8
-
9 5
 o Complete vignette (get rid of all TODO from it).
10 6
 
11 7
 o Complete man pages (get rid of all TODO from them).
12 8
 
13
-o Fix issue with calls to plotting functions requiring 1 more key-stroke than
14
-  necessary before they actually start to plot something.
15
-
16
-o Add by="gene" to countReads() for counting of unique reads per gene.
17
-
18 9
 o Add 'drop.ambiguous.hits' arg to countReads() for dropping reads that are
19 10
   assigned to multiple edges (when by="sgedge"), to multiple reduced edges
20 11
   (when by="rsgedge"), to multiple transcripts (when by="tx"), or to
... ...
@@ -30,3 +21,11 @@ o Add 'drop.ambiguous.hits' arg to countReads() for dropping reads that are
30 21
   countReads(sg, by="tx", drop.ambiguous.hits=TRUE) and compare. Should
31 22
   produce the same result.
32 23
 
24
+o Fix issue with calls to plotting functions requiring 1 more key-stroke than
25
+  necessary before they actually start to plot something.
26
+
27
+o Implement rsgedgesByTranscript().
28
+
29
+o Move vignette to vignettes/
30
+
31
+
... ...
@@ -74,7 +74,7 @@
74 74
           SplicingGraphs object.
75 75
 
76 76
     \item \code{\link{countReads}} for summarizing the reads assigned to
77
-          the edges of a SplicingGraphs object.
77
+          a SplicingGraphs object.
78 78
 
79 79
     \item \code{\link{toy_genes_gff}} for details about the toy data included
80 80
           in this package.
... ...
@@ -7,15 +7,21 @@
7 7
 
8 8
 
9 9
 \title{
10
-  Summarize the reads assigned to the edges of a SplicingGraphs object
10
+  Summarize the reads assigned to a SplicingGraphs object
11 11
 }
12 12
 
13 13
 \description{
14
-  \code{countReads} returns a summarized count of the reads assigned to
15
-  the edges of a SplicingGraphs object.
14
+  \code{getReads} returns the reads assigned to a SplicingGraphs object,
15
+  summarized either by splicing graph edge, \emph{reduced} splicing graph
16
+  edge, transcript, or gene.
17
+
18
+  \code{countReads} counts the reads assigned to a SplicingGraphs object.
19
+  The counting can be done by splicing graph edge, \emph{reduced} splicing
20
+  graph edge, transcript, or gene.
16 21
 }
17 22
 
18 23
 \usage{
24
+getReads(x, by=c("sgedge", "rsgedge", "tx", "gene"))
19 25
 countReads(x, by=c("sgedge", "rsgedge", "tx", "gene"))
20 26
 }
21 27
 
... ...
@@ -24,9 +30,9 @@ countReads(x, by=c("sgedge", "rsgedge", "tx", "gene"))
24 30
     A \link{SplicingGraphs} object.
25 31
   }
26 32
   \item{by}{
27
-    Summarize by splicing graph edge (\code{by="sgedge"}), by \emph{reduced}
28
-    splicing graph edge (\code{by="rsgedge"}), by transcript (\code{by="tx"}),
29
-    or by gene (\code{by="gene"}).
33
+    Summarize/count by splicing graph edge (\code{by="sgedge"}), by
34
+    \emph{reduced} splicing graph edge (\code{by="rsgedge"}), by transcript
35
+    (\code{by="tx"}), or by gene (\code{by="gene"}).
30 36
   }
31 37
 }
32 38
 
... ...
@@ -43,8 +49,9 @@ countReads(x, by=c("sgedge", "rsgedge", "tx", "gene"))
43 49
     \item gene if \code{by="gene"}.
44 50
   }
45 51
 
46
-  And with one column per sample (containing the counts for that sample),
47
-  plus the two following additional leading columns:
52
+  And with one column per sample (containing the reads for that sample for
53
+  \code{getReads}, and the counts for that sample for \code{countReads}),
54
+  plus the following two additional leading columns:
48 55
   \itemize{
49 56
     \item if \code{by="sgedge"}: \code{"sgedge_id"}, containing the
50 57
           \emph{global splicing graph edge ids}, and \code{"ex_or_in"},
... ...
@@ -76,16 +83,32 @@ countReads(x, by=c("sgedge", "rsgedge", "tx", "gene"))
76 83
 example(assignReads)
77 84
 
78 85
 ## ---------------------------------------------------------------------
79
-## 2. Summarize the reads assigned to 'sg'
86
+## 2. Summarize the reads by splicing graph edge
87
+## ---------------------------------------------------------------------
88
+getReads(sg) 
89
+countReads(sg)
90
+
91
+## ---------------------------------------------------------------------
92
+## 3. Summarize the reads by reduced splicing graph edge
93
+## ---------------------------------------------------------------------
94
+getReads(sg, by="rsgedge")
95
+countReads(sg, by="rsgedge")
96
+
97
+## ---------------------------------------------------------------------
98
+## 4. Summarize the reads by transcript
99
+## ---------------------------------------------------------------------
100
+getReads(sg, by="tx")
101
+countReads(sg, by="tx")
102
+
103
+## ---------------------------------------------------------------------
104
+## 4. Summarize the reads by gene
80 105
 ## ---------------------------------------------------------------------
81
-countReads(sg)  # nb of reads per splicing graph edge
82
-countReads(sg, by="rsgedge")  # ... per reduced splicing graph edge
83
-countReads(sg, by="tx")  # ... per transcript
84
-countReads(sg, by="gene")  # ... per gene
106
+getReads(sg, by="gene")
107
+countReads(sg, by="gene")
85 108
 
86 109
 ## ---------------------------------------------------------------------
87
-## 3. Remove the reads from 'sg'.
110
+## 5. Remove the reads from 'sg'.
88 111
 ## ---------------------------------------------------------------------
89
-removeReads(sg)
112
+sg <- removeReads(sg)
90 113
 countReads(sg)
91 114
 }