Bioconductor Code: pram

Browse code

add a function and testing script to select transcript models by # of exons and transcript length

Peng Liu authored on 11/02/2018 22:18:36
Showing 8 changed files

DESCRIPTION index 8efcdda..2026db1 100644
NAMESPACE index debf2a0..5cbabb6 100644
R/selModel.R index 0000000..0eed799
inst/extdata/gtf/selModel_in.gtf index 0000000..5ce5550
inst/extdata/gtf/selModel_out.gtf index 0000000..9d257bc
job.R index 5f04c9d..25939d7 100755
man/selModel.Rd index 0000000..666d59d
tests/testthat/test-selModel.R index 0000000..99c3f47

History View file @ aea4be5

@@ -23,4 +23,5 @@ Collate:
                          'evalModel.R'
                          'prepIgBam.R'
                          'screenModel.R'
                     +    'selModel.R'
                          'util.R'

NAMESPACE

History View file @ aea4be5

@@ -6,6 +6,7 @@ export(evalModel)
                      export(prepIgBam)
                      export(readGTF)
                      export(screenModel)
                     +export(selModel)
                      export(trainModelClassifier)
                      import(data.table)
                      import(methods)

R/selModel.R

History View file @ aea4be5

                     new file mode 100644
@@ -0,0 +1,54 @@
                     +#' Select transcript models
                     +#'
                     +#' Reads transcript models from a GTF file, keeps the transcripts whose
                     +#' 'exon' records satisfy both the minimum exon count and the minimum
                     +#' span, and writes all records of the kept transcripts to a new GTF
                     +#' file. Transcripts without any 'exon' record are never selected.
                     +#'
                     +#' @param  fin_gtf  Character of an input GTF file that contains
                     +#'                  transcript models. Required to have 'transcript_id' in the
                     +#'                  attribute column (column 9)
                     +#'
                     +#' @param  fout_gtf  Character of an output GTF file that contains selected
                     +#'                   transcript models
                     +#'
                     +#' @param  min_n_exon  Minimum number of exons a transcript model is
                     +#'                     required to have
                     +#'                     Default: 2
                     +#'
                     +#' @param  min_tr_len  Minimum length (bp) of exon(s) and intron(s) a
                     +#'                     transcript model is required to have, computed as
                     +#'                     max(end) - min(start) over the transcript's exons
                     +#'                     Default: 200
                     +#'
                     +#' @param  info_keys  A vector of characters defining the attributes in input
                     +#'                    GTF file's column 9 to be saved in the output GTF file.
                     +#'                    'transcript_id' will always be saved.
                     +#'                    Default: c( 'transcript_id' )
                     +#'
                     +#' @return  The value returned by writeGTF(); the function is called for
                     +#'          its side effect of writing \code{fout_gtf}
                     +#'
                     +#' @export
                     +#'
                     +# Defaults live on the generic and dispatch is restricted to the two file
                     +# arguments: S4 only honours a method's default when the generic also has
                     +# one, and an omitted argument dispatches as class "missing", so without
                     +# this selModel(fin_gtf, fout_gtf) could never reach the documented defaults.
                     +setGeneric('selModel',
                     +           function(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200,
                     +                    info_keys=c('transcript_id'))
                     +           standardGeneric('selModel'),
                     +           signature=c('fin_gtf', 'fout_gtf'))
+                    +
                     +setMethod(
                     +'selModel',
                     +c('character', 'character'),
                     +function(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200,
                     +         info_keys = c('transcript_id') ) {
                     +    # These arguments no longer take part in dispatch, so enforce the
                     +    # types the previous five-argument signature required.
                     +    stopifnot(is.numeric(min_n_exon), is.numeric(min_tr_len),
                     +              is(info_keys, 'vector'))
+                    +
                     +    in_gtf  = new('GTF')
                     +    out_gtf = new('GTF')
                     +    # 'transcript_id' is always kept because it is the grouping key below
                     +    out_infokeys = unique(c('transcript_id', info_keys))
+                    +
                     +    in_gtf = initFromGTFFile(in_gtf, fin_gtf, infokeys=out_infokeys)
                     +    grdt = grangedt(in_gtf)
+                    +
                     +    # Per-transcript statistics are derived from 'exon' records only
                     +    exondt = grdt[ feature == 'exon' ]
                     +    # NOTE(review): if start/end are 1-based inclusive GTF coordinates the
                     +    # true span is max(end) - min(start) + 1 -- confirm before changing,
                     +    # since it shifts the min_tr_len boundary by one base.
                     +    dt = exondt[, list( n_exon = .N,
                     +                        tr_len = max(end) - min(start) ), by=transcript_id]
+                    +
                     +    sel_trids = dt[ ( n_exon >= min_n_exon  ) &
                     +                    ( tr_len >= min_tr_len ) ]$transcript_id
                     +    # Keep every record (not just exons) of the selected transcripts
                     +    sel_grdt = grdt[ transcript_id %in% sel_trids ]
+                    +
                     +    origin(out_gtf)   = origin(in_gtf)
                     +    infokeys(out_gtf) = out_infokeys
                     +    grangedt(out_gtf) = sel_grdt
+                    +
                     +    writeGTF(out_gtf, fout_gtf, append=FALSE)
                     +})

inst/extdata/gtf/selModel_in.gtf

History View file @ aea4be5

                     new file mode 100644
@@ -0,0 +1,12 @@
                     +##hg38 v24
                     +chr1	UNKNOWN	exon	100	200	.	+	.	transcript_id "ENSG0.6"; gene_id "ENSG.6"; exon_number "1";
                     +chr9	UNKNOWN	gene	100	900	.	-	.	transcript_id "ENSG0.0"; gene_id "ENSG.0"; exon_number "1";
                     +chr10	UNKNOWN	exon	100	150	.	-	.	transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "1";
                     +chr10	UNKNOWN	exon	180	200	.	-	.	transcript_id "ENSG0.1"; gene_id "ENSG.1"; exon_number "2";
                     +chr10	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.2"; gene_id "ENSG.2"; exon_number "1";
                     +chr11	UNKNOWN	exon	100	200	.	+	.	transcript_id "ENSG0.3"; gene_id "ENSG.3"; exon_number "1";
                     +chr11	UNKNOWN	exon	500	600	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "1";
                     +chr11	UNKNOWN	exon	800	900	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4"; exon_number "2";
                     +chr12	UNKNOWN	exon	700	750	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1";
                     +chr12	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "2";
                     +chr12	UNKNOWN	transcript	700	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5"; exon_number "1";

inst/extdata/gtf/selModel_out.gtf

History View file @ aea4be5

                     new file mode 100644
@@ -0,0 +1,5 @@
                     +chr11	UNKNOWN	exon	500	600	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4";
                     +chr11	UNKNOWN	exon	800	900	.	-	.	transcript_id "ENSG0.4"; gene_id "ENSG.4";
                     +chr12	UNKNOWN	exon	700	750	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";
                     +chr12	UNKNOWN	exon	800	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";
                     +chr12	UNKNOWN	transcript	700	900	.	+	.	transcript_id "ENSG0.5"; gene_id "ENSG.5";

job.R

History View file @ aea4be5

@@ -14,4 +14,5 @@ install(quick=T, reload=F, threads=4)
                      # test( filter = 'defIgRanges' )
                      # test( filter = 'buildModel' )
                      # test( filter = 'evalModel' )
                     -  test()
                     +# test( filter = 'selModel' )
                     + test()

man/selModel.Rd

History View file @ aea4be5

                     new file mode 100644
@@ -0,0 +1,32 @@
                     +% Generated by roxygen2: do not edit by hand
                     +% Please edit documentation in R/selModel.R
                     +\name{selModel}
                     +\alias{selModel}
                     +\title{Select transcript models}
                     +\usage{
                     +selModel(fin_gtf, fout_gtf, min_n_exon, min_tr_len, info_keys)
                     +}
                     +\arguments{
                     +\item{fin_gtf}{Character of an input GTF file that contains
                     +transcript models. Required to have 'transcript_id' in the
                     +attribute column (column 9)}
+                    +
                     +\item{fout_gtf}{Character of an output GTF file that contains selected
                     +transcript models}
+                    +
                     +\item{min_n_exon}{Minimum number of exons a transcript model is required to
                     +have
                     +Default: 2}
+                    +
                     +\item{min_tr_len}{Minimum length (bp) of exon(s) and intron(s) a
                     +transcript model is required to have
                     +Default: 200}
+                    +
                     +\item{info_keys}{A vector of characters defining the attributes in input
                     +GTF file's column 9 to be saved in the output GTF file.
                     +'transcript_id' will always be saved.
                     +Default: c( 'transcript_id' )}
                     +}
                     +\description{
                     +Select transcript models
                     +}

tests/testthat/test-selModel.R

History View file @ aea4be5

                     new file mode 100644
@@ -0,0 +1,18 @@
                     +# Regression test for selModel(): runs it on the bundled input GTF with
                     +# min_n_exon=2 and min_tr_len=200 and compares the written file line by
                     +# line against the bundled expected output.
                     +main <- function() {
                     +    context('selModel')
+                    +
                     +    fin_gtf  = system.file('extdata/gtf/selModel_in.gtf',  package='pram')
                     +    fcmp_gtf = system.file('extdata/gtf/selModel_out.gtf', package='pram')
                     +    fout_gtf = file.path(tempdir(), 'pram_selModel_out.gtf')
                     +    # Remove the generated file when the test finishes so repeated runs
                     +    # never compare against a stale result
                     +    on.exit(unlink(fout_gtf), add=TRUE)
+                    +
                     +    selModel(fin_gtf, fout_gtf, min_n_exon=2, min_tr_len=200,
                     +             info_keys=c('transcript_id', 'gene_id'))
+                    +
                     +    cmp_lines = readLines(fcmp_gtf)
                     +    out_lines = readLines(fout_gtf)
+                    +
                     +    # Actual value first, expected second, so failure messages are
                     +    # labelled correctly
                     +    test_that(paste0(fout_gtf, ' vs ', fcmp_gtf),
                     +              expect_identical(out_lines, cmp_lines))
                     +}
+                    +
                     +main()