12 | 13 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,54 @@ |
1 |
+#' Select transcript models |
|
2 |
+#' |
|
3 |
+#' @param fin_gtf Character of an input GTF file that contains |
|
4 |
+#' transcript models. Required to have 'transcript_id' in the |
|
5 |
+#' attribute column (column 9) |
|
6 |
+#' |
|
7 |
+#' @param fout_gtf Character of an output GTF file that contains selected |
|
8 |
+#' transcript models |
|
9 |
+#' |
|
10 |
+#' @param min_n_exon Minimium number of exons a transcript model required to |
|
11 |
+#' have |
|
12 |
+#' Default: 2 |
|
13 |
+#' |
|
14 |
+#' @param min_tr_len Minimium length (bp) of exon(s) and intron(s) a |
|
15 |
+#' transcript model required to have |
|
16 |
+#' Default: 200 |
|
17 |
+#' |
|
18 |
+#' @param info_keys A vector of characters defining the attributes in input |
|
19 |
+#' GTF file's column 9 to be saved in the output GTF file. |
|
20 |
+#' 'transcript_id' will always be saved. |
|
21 |
+#' Default: c( 'transcript_id' ) |
|
22 |
+#' |
|
23 |
+#' @export |
|
24 |
+#' |
|
25 |
+setGeneric('selModel', |
|
26 |
+ function(fin_gtf, fout_gtf, min_n_exon, min_tr_len, info_keys) |
|
27 |
+ standardGeneric('selModel')) |
|
28 |
+ |
|
29 |
+setMethod( |
|
30 |
+'selModel', |
|
31 |
+c('character', 'character', 'numeric', 'numeric', 'vector'), |
|
32 |
+function(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200, |
|
33 |
+ info_keys = c('transcript_id') ) { |
|
34 |
+ in_gtf = new('GTF') |
|
35 |
+ out_gtf = new('GTF') |
|
36 |
+ out_infokeys = unique(c('transcript_id', info_keys)) |
|
37 |
+ |
|
38 |
+ in_gtf = initFromGTFFile(in_gtf, fin_gtf, infokeys=out_infokeys) |
|
39 |
+ grdt = grangedt(in_gtf) |
|
40 |
+ |
|
41 |
+ exondt = grdt[ feature == 'exon' ] |
|
42 |
+ dt = exondt[, list( n_exon = .N, |
|
43 |
+ tr_len = max(end) - min(start) ), by=transcript_id] |
|
44 |
+ |
|
45 |
+ sel_trids = dt[ ( n_exon >= min_n_exon ) & |
|
46 |
+ ( tr_len >= min_tr_len ) ]$transcript_id |
|
47 |
+ sel_grdt = grdt[ transcript_id %in% sel_trids ] |
|
48 |
+ |
|
49 |
+ origin(out_gtf) = origin(in_gtf) |
|
50 |
+ infokeys(out_gtf) = out_infokeys |
|
51 |
+ grangedt(out_gtf) = sel_grdt |
|
52 |
+ |
|
53 |
+ writeGTF(out_gtf, fout_gtf, append=F) |
|
54 |
+}) |
0 | 55 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,12 @@ |
1 |
+##hg38 v24 |
|
2 |
+chr1 UNKNOWN exon 100 200 . + . transcript_id "ENSG0.6"; gene_id "ENSG.6"; exon_number "1"; |
|
3 |
+chr9 UNKNOWN gene 100 900 . - . transcript_id "ENSG0.0"; gene_id "ENSG.0"; exon_number "1"; |
|
4 |
+chr10 UNKNOWN exon 100 150 . - . transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "1"; |
|
5 |
+chr10 UNKNOWN exon 180 200 . - . transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "2"; |
|
6 |
+chr10 UNKNOWN exon 800 900 . + . transcript_id "ENSG0.2"; gene_id "ENSG.2"; exon_number "1"; |
|
7 |
+chr11 UNKNOWN exon 100 200 . + . transcript_id "ENSG0.3"; gene_id "ENSG.3"; exon_number "1"; |
|
8 |
+chr11 UNKNOWN exon 500 600 . - . transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "1"; |
|
9 |
+chr11 UNKNOWN exon 800 900 . - . transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "2"; |
|
10 |
+chr12 UNKNOWN exon 700 750 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1"; |
|
11 |
+chr12 UNKNOWN exon 800 900 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "2"; |
|
12 |
+chr12 UNKNOWN transcript 700 900 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1"; |
0 | 13 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,5 @@ |
1 |
+chr11 UNKNOWN exon 500 600 . - . transcript_id "ENSG0.4"; gene_id "ENSG.4"; |
|
2 |
+chr11 UNKNOWN exon 800 900 . - . transcript_id "ENSG0.4"; gene_id "ENSG.4"; |
|
3 |
+chr12 UNKNOWN exon 700 750 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; |
|
4 |
+chr12 UNKNOWN exon 800 900 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; |
|
5 |
+chr12 UNKNOWN transcript 700 900 . + . transcript_id "ENSG0.5"; gene_id "ENSG.5"; |
18 | 19 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,32 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/selModel.R |
|
3 |
+\name{selModel} |
|
4 |
+\alias{selModel} |
|
5 |
+\title{Select transcript models} |
|
6 |
+\usage{ |
|
7 |
+selModel(fin_gtf, fout_gtf, min_n_exon, min_tr_len, info_keys) |
|
8 |
+} |
|
9 |
+\arguments{ |
|
10 |
+\item{fin_gtf}{Character of an input GTF file that contains |
|
11 |
+transcript models. Required to have 'transcript_id' in the |
|
12 |
+attribute column (column 9)} |
|
13 |
+ |
|
14 |
+\item{fout_gtf}{Character of an output GTF file that contains selected |
|
15 |
+transcript models} |
|
16 |
+ |
|
17 |
+\item{min_n_exon}{Minimium number of exons a transcript model required to |
|
18 |
+have |
|
19 |
+Default: 2} |
|
20 |
+ |
|
21 |
+\item{min_tr_len}{Minimium length (bp) of exon(s) and intron(s) a |
|
22 |
+transcript model required to have |
|
23 |
+Default: 200} |
|
24 |
+ |
|
25 |
+\item{info_keys}{A vector of characters defining the attributes in input |
|
26 |
+GTF file's column 9 to be saved in the output GTF file. |
|
27 |
+'transcript_id' will always be saved. |
|
28 |
+Default: c( 'transcript_id' )} |
|
29 |
+} |
|
30 |
+\description{ |
|
31 |
+Select transcript models |
|
32 |
+} |
0 | 33 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,18 @@ |
1 |
+main <- function() { |
|
2 |
+ context('selModel') |
|
3 |
+ |
|
4 |
+ fin_gtf = system.file('extdata/gtf/selModel_in.gtf', package='pram') |
|
5 |
+ fcmp_gtf = system.file('extdata/gtf/selModel_out.gtf', package='pram') |
|
6 |
+ fout_gtf = paste0(tempdir(), '/pram_selModel_out.gtf') |
|
7 |
+ |
|
8 |
+ selModel(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200, |
|
9 |
+ info_keys=c('transcript_id', 'gene_id')) |
|
10 |
+ |
|
11 |
+ cmp_lines = readLines(fcmp_gtf) |
|
12 |
+ out_lines = readLines(fout_gtf) |
|
13 |
+ |
|
14 |
+ test_that(paste0(fout_gtf, ' vs ', fcmp_gtf), |
|
15 |
+ expect_identical(cmp_lines, out_lines)) |
|
16 |
+} |
|
17 |
+ |
|
18 |
+main() |