Browse code

add a function and testing script to select transcript models by # of exons and transcript length

Peng Liu authored on 11/02/2018 22:18:36
Showing 8 changed files

... ...
@@ -23,4 +23,5 @@ Collate:
23 23
     'evalModel.R'
24 24
     'prepIgBam.R'
25 25
     'screenModel.R'
26
+    'selModel.R'
26 27
     'util.R'
... ...
@@ -6,6 +6,7 @@ export(evalModel)
6 6
 export(prepIgBam)
7 7
 export(readGTF)
8 8
 export(screenModel)
9
+export(selModel)
9 10
 export(trainModelClassifier)
10 11
 import(data.table)
11 12
 import(methods)
12 13
new file mode 100644
... ...
@@ -0,0 +1,54 @@
1
+#' Select transcript models
2
+#'
3
+#' @param  fin_gtf  Character of an input GTF file that contains
4
+#'                  transcript models. Required to have 'transcript_id' in the
5
+#'                  attribute column (column 9)
6
+#'
7
+#' @param  fout_gtf  Character of an output GTF file that contains selected
8
+#'                   transcript models
9
+#'
10
+#' @param  min_n_exon  Minimium number of exons a transcript model required to
11
+#'                     have
12
+#'                     Default: 2
13
+#'
14
+#' @param  min_tr_len  Minimium length (bp) of exon(s) and intron(s) a
15
+#'                     transcript model required to have
16
+#'                     Default: 200
17
+#'
18
+#' @param  info_keys  A vector of characters defining the attributes in input
19
+#'                    GTF file's column 9 to be saved in the output GTF file.
20
+#'                    'transcript_id' will always be saved.
21
+#'                    Default: c( 'transcript_id' )
22
+#'
23
+#' @export
24
+#'
25
+setGeneric('selModel',
26
+           function(fin_gtf, fout_gtf, min_n_exon, min_tr_len, info_keys)
27
+           standardGeneric('selModel'))
28
+
29
+setMethod(
30
+'selModel',
31
+c('character', 'character', 'numeric', 'numeric', 'vector'),
32
+function(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200,
33
+         info_keys = c('transcript_id') ) {
34
+    in_gtf  = new('GTF')
35
+    out_gtf = new('GTF')
36
+    out_infokeys = unique(c('transcript_id', info_keys))
37
+
38
+    in_gtf = initFromGTFFile(in_gtf, fin_gtf, infokeys=out_infokeys)
39
+    grdt = grangedt(in_gtf)
40
+
41
+    exondt = grdt[ feature == 'exon' ]
42
+    dt = exondt[, list( n_exon = .N,
43
+                        tr_len = max(end) - min(start) ), by=transcript_id]
44
+
45
+    sel_trids = dt[ ( n_exon >= min_n_exon  ) &
46
+                    ( tr_len >= min_tr_len ) ]$transcript_id
47
+    sel_grdt = grdt[ transcript_id %in% sel_trids ]
48
+
49
+    origin(out_gtf)   = origin(in_gtf)
50
+    infokeys(out_gtf) = out_infokeys
51
+    grangedt(out_gtf) = sel_grdt
52
+
53
+    writeGTF(out_gtf, fout_gtf, append=F)
54
+})
0 55
new file mode 100644
... ...
@@ -0,0 +1,12 @@
1
+##hg38 v24 
2
+chr1	UNKNOWN	exon	100	200	.	+	.	transcript_id "ENSG0.6"; gene_id "ENSG.6"; exon_number "1";
3
+chr9	UNKNOWN	gene	100	900	.	-	.	transcript_id "ENSG0.0"; gene_id "ENSG.0"; exon_number "1";
4
+chr10	UNKNOWN	exon	100	150	.	-	.	transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "1";
5
+chr10	UNKNOWN	exon	180	200	.	-	.	transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "2";
6
+chr10	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.2"; gene_id "ENSG.2"; exon_number "1";
7
+chr11	UNKNOWN	exon	100	200	.	+	.	transcript_id "ENSG0.3"; gene_id "ENSG.3"; exon_number "1";
8
+chr11	UNKNOWN	exon	500	600	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "1";
9
+chr11	UNKNOWN	exon	800	900	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "2";
10
+chr12	UNKNOWN	exon	700	750	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1";
11
+chr12	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "2";
12
+chr12	UNKNOWN	transcript	700	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1";
0 13
new file mode 100644
... ...
@@ -0,0 +1,5 @@
1
+chr11	UNKNOWN	exon	500	600	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4";
2
+chr11	UNKNOWN	exon	800	900	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4";
3
+chr12	UNKNOWN	exon	700	750	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";
4
+chr12	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";
5
+chr12	UNKNOWN	transcript	700	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";
... ...
@@ -14,4 +14,5 @@ install(quick=T, reload=F, threads=4)
14 14
 # test( filter = 'defIgRanges' )
15 15
 # test( filter = 'buildModel' )
16 16
 # test( filter = 'evalModel' )
17
-  test()
17
+# test( filter = 'selModel' )
18
+ test()
18 19
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/selModel.R
3
+\name{selModel}
4
+\alias{selModel}
5
+\title{Select transcript models}
6
+\usage{
7
+selModel(fin_gtf, fout_gtf, min_n_exon, min_tr_len, info_keys)
8
+}
9
+\arguments{
10
+\item{fin_gtf}{Character of an input GTF file that contains
11
+transcript models. Required to have 'transcript_id' in the
12
+attribute column (column 9)}
13
+
14
+\item{fout_gtf}{Character of an output GTF file that contains selected
15
+transcript models}
16
+
17
+\item{min_n_exon}{Minimium number of exons a transcript model required to
18
+have
19
+Default: 2}
20
+
21
+\item{min_tr_len}{Minimium length (bp) of exon(s) and intron(s) a
22
+transcript model required to have
23
+Default: 200}
24
+
25
+\item{info_keys}{A vector of characters defining the attributes in input
26
+GTF file's column 9 to be saved in the output GTF file.
27
+'transcript_id' will always be saved.
28
+Default: c( 'transcript_id' )}
29
+}
30
+\description{
31
+Select transcript models
32
+}
0 33
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+main <- function() {
2
+    context('selModel')
3
+
4
+    fin_gtf  = system.file('extdata/gtf/selModel_in.gtf',  package='pram')
5
+    fcmp_gtf = system.file('extdata/gtf/selModel_out.gtf', package='pram')
6
+    fout_gtf = paste0(tempdir(), '/pram_selModel_out.gtf')
7
+
8
+    selModel(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200,
9
+             info_keys=c('transcript_id', 'gene_id'))
10
+
11
+    cmp_lines = readLines(fcmp_gtf)
12
+    out_lines = readLines(fout_gtf)
13
+
14
+    test_that(paste0(fout_gtf, ' vs ', fcmp_gtf),
15
+              expect_identical(cmp_lines, out_lines))
16
+}
17
+
18
+main()