Browse code

move H5 class and methods to separate file

LiNk-NY authored on 02/06/2022 14:35:03
Showing 8 changed files

... ...
@@ -36,4 +36,5 @@ Collate:
36 36
     'TENxFile-class.R'
37 37
     'TENxFileList-class.R'
38 38
     'TENxFile-methods.R'
39
+    'TENxH5-class.R'
39 40
     'utils.R'
... ...
@@ -23,13 +23,6 @@
23 23
 
24 24
 S4Vectors::setValidity2("TENxFile", .validTENxFile)
25 25
 
26
-#' @exportClass TENxH5
27
-.TENxH5 <- setClass(
28
-    Class = "TENxH5",
29
-    contains = "TENxFile",
30
-    slots = c(version = "character", group = "character")
31
-)
32
-
33 26
 .TENxMTX <- setClass(
34 27
     Class = "TENxMTX",
35 28
     contains = "TENxFile"
... ...
@@ -70,36 +63,3 @@ TENxFile <- function(resource, ...) {
70 63
     TENxFUN(resource = resource,  extension = ext, ...)
71 64
 }
72 65
 
73
-.get_h5_group <- function(fpath) {
74
-    if (!requireNamespace("rhdf5", quietly = TRUE))
75
-        stop("Install 'rhdf5' to work with TENxH5")
76
-    l1 <- rhdf5::h5ls(fpath, recursive = FALSE)
77
-    l1[l1$otype == "H5I_GROUP", "name"]
78
-}
79
-
80
-.KNOWN_H5_GROUPS <- c("matrix", "outs")
81
-
82
-.check_group_ok <- function(fpath) {
83
-    gname <- .get_h5_group(fpath)
84
-    if (!gname %in% .KNOWN_H5_GROUPS)
85
-        stop("'group' not recognized")
86
-    gname
87
-}
88
-
89
-#' @rdname TENxH5
90
-#' @title Import H5 files from 10X
91
-#'
92
-#' @examples
93
-#'
94
-#' h5f <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5"
95
-#' con <- TENxFile(h5f)
96
-#' import(con)
97
-#'
98
-#' @export
99
-TENxH5 <-
100
-    function(resource, version = c("3", "2"), ...)
101
-{
102
-    version <- match.arg(version)
103
-    group <- .check_group_ok(resource)
104
-    .TENxH5(resource = resource, group = group, version = version, ...)
105
-}
... ...
@@ -1,215 +1,6 @@
1
-#' @importFrom MatrixGenerics rowRanges
2 1
 #' @include TENxFileList-class.R
3 2
 NULL
4 3
 
5
-gene.meta <- data.frame(
6
-    version = c("3", "2"),
7
-    ID = c("/features/id", "/genes"),
8
-    Symbol = c("/features/name", "/gene_names"),
9
-    Type = c("/features/feature_type", NA_character_)
10
-)
11
-
12
-.getRowDat <- function(con) {
13
-    gm <- gene.meta[
14
-        con@version == gene.meta[["version"]],
15
-        !names(gene.meta) %in% c("group", "version")
16
-    ]
17
-    gm[] <- Filter(Negate(is.na), gm)
18
-    res <- lapply(gm, function(colval) {
19
-        readname <- paste0(con@group, colval)
20
-        rhdf5::h5read(path(con), readname)
21
-    })
22
-    as.data.frame(res)
23
-}
24
-
25
-.getGenome <- function(con) {
26
-    gloc <- "matrix/features/genome"
27
-    gen <- unique(rhdf5::h5read(path(con), gloc))
28
-    if (length(gen) != 1L)
29
-        stop("The genome build in ", gloc, " is not consistent")
30
-    gen
31
-}
32
-
33
-#' @importFrom GenomeInfoDb genome genome<-
34
-#' @importFrom S4Vectors mcols<-
35
-.getRowRanges <- function(con) {
36
-    interval <- rhdf5::h5read(path(con), "matrix/features/interval")
37
-    interval[interval == "NA"] <- "NA_character_:0"
38
-    gr <- as(as.character(interval), "GRanges")
39
-    mcols(gr) <- .getRowDat(con)
40
-    genbuild <- rep(.getGenome(con), length(genome(gr)))
41
-    genome(gr) <- genbuild
42
-    gr
43
-}
44
-
45
-#' @import SingleCellExperiment
46
-#' @export
47
-setMethod("import", "TENxH5", function(con, format, text, ...) {
48
-    if (!requireNamespace("HDF5Array", quietly = TRUE))
49
-        stop("Install 'HDF5Array' to import TENxH5 files")
50
-    matrixdata <- HDF5Array::TENxMatrix(path(con), con@group)
51
-    if (identical(con@version, "3")) {
52
-        sce <- SingleCellExperiment::SingleCellExperiment(
53
-            assays = list(counts = matrixdata), rowRanges = .getRowRanges(con)
54
-        )
55
-        rownames(sce) <- mcols(sce)[["ID"]]
56
-        splitAltExps(sce, rowData(sce)[["Type"]], ref = "Gene Expression")
57
-    } else {
58
-        stop("Version 2 not supported yet.")
59
-    }
60
-})
61
-
62
-#' @import SummarizedExperiment
63
-#' @export
64
-setMethod("import", "TENxMTX", function(con, format, text, ...) {
65
-    mtxf <- SingleCellMultiModal:::.read_mtx(path(con))
66
-    ## TODO: make use of other files
67
-    SummarizedExperiment::SummarizedExperiment(
68
-        assays = SimpleList(counts = mtxf)
69
-    )
70
-})
71
-
72
-
73
-#' @importFrom utils untar tail
74
-.TENxUntar <- function(con) {
75
-    dir.create(tempdir <- tempfile())
76
-    untar(path(con), exdir = tempdir)
77
-    tempdir
78
-}
79
-
80
-.readInFuns <- function(files) {
81
-    file_exts <- .get_ext(files)
82
-    lapply(.setNames(file_exts, basename(files)), function(ext) {
83
-        switch(
84
-            ext,
85
-            mtx.gz = Matrix::readMM,
86
-            tsv.gz = function(...)
87
-                readr::read_tsv(col_names = FALSE, show_col_types = FALSE, ...)
88
-        )
89
-    })
90
-}
91
-
92
-.cleanUpFuns <- function(datalist) {
93
-    if (is.null(names(datalist)))
94
-        stop("'datalist' names must correspond to originating file names")
95
-    lapply(.setNames(nm = names(datalist)), function(fname) {
96
-        switch(
97
-            fname,
98
-            features.tsv.gz = function(df) {
99
-                names(df) <- c("ID", "Symbol", "Type", "Chr", "Start", "End")
100
-                df
101
-            },
102
-            barcodes.tsv.gz = function(df) {
103
-                names(df) <- "barcode"
104
-                df
105
-            },
106
-            matrix.mtx.gz = function(mat) {
107
-                as(mat, "dgCMatrix")
108
-            },
109
-        )
110
-    })
111
-}
112
-
113
-.TENxDecompress <- function(con) {
114
-    res_ext <- .get_ext(path(con))
115
-    if (identical(res_ext, "tar.gz")) {
116
-        tenfolder <- .TENxUntar(con)
117
-        gfolder <- list.files(tenfolder, full.names = TRUE)
118
-        if (file.info(gfolder)$isdir)
119
-            gfiles <- list.files(gfolder, recursive = TRUE, full.names = TRUE)
120
-        else
121
-            gfiles <- gfolder
122
-        gdata <- Map(
123
-            f = function(reader, x) {
124
-                reader(x)
125
-            }, reader = .readInFuns(gfiles), x = gfiles
126
-        )
127
-        Map(f = function(cleaner, x) {
128
-                cleaner(x)
129
-            }, cleaner = .cleanUpFuns(gdata), x = gdata
130
-        )
131
-    } else {
132
-        stop("Extension type: ", res_ext, " not supported")
133
-    }
134
-}
135
-
136
-#' @export
137
-setMethod("import", "TENxFileList", function(con, format, text, ...) {
138
-    if (con@compressed)
139
-        fdata <- .TENxDecompress(con)
140
-    else
141
-        fdata <- con@listData
142
-    mat <- fdata[["matrix.mtx.gz"]]
143
-    colnames(mat) <- unlist(fdata[["barcodes.tsv.gz"]])
144
-    warning("Matrix of mixed types; see in rowData(x)")
145
-    SingleCellExperiment::SingleCellExperiment(
146
-        SimpleList(counts = mat),
147
-        rowData = fdata[["features.tsv.gz"]]
148
-    )
149
-})
150
-
151
-#' @importFrom MatrixGenerics rowRanges
152
-#' @include TENxFileList-class.R
153
-NULL
154
-
155
-gene.meta <- data.frame(
156
-    version = c("3", "2"),
157
-    ID = c("/features/id", "/genes"),
158
-    Symbol = c("/features/name", "/gene_names"),
159
-    Type = c("/features/feature_type", NA_character_)
160
-)
161
-
162
-.getRowDat <- function(con) {
163
-    gm <- gene.meta[
164
-        con@version == gene.meta[["version"]],
165
-        !names(gene.meta) %in% c("group", "version")
166
-    ]
167
-    gm[] <- Filter(Negate(is.na), gm)
168
-    res <- lapply(gm, function(colval) {
169
-        readname <- paste0(con@group, colval)
170
-        rhdf5::h5read(path(con), readname)
171
-    })
172
-    as.data.frame(res)
173
-}
174
-
175
-.getGenome <- function(con) {
176
-    gloc <- "matrix/features/genome"
177
-    gen <- unique(rhdf5::h5read(path(con), gloc))
178
-    if (length(gen) != 1L)
179
-        stop("The genome build in ", gloc, " is not consistent")
180
-    gen
181
-}
182
-
183
-#' @importFrom GenomeInfoDb genome genome<-
184
-#' @importFrom S4Vectors mcols<-
185
-.getRowRanges <- function(con) {
186
-    interval <- rhdf5::h5read(path(con), "matrix/features/interval")
187
-    interval[interval == "NA"] <- "NA_character_:0"
188
-    gr <- as(as.character(interval), "GRanges")
189
-    mcols(gr) <- .getRowDat(con)
190
-    genbuild <- rep(.getGenome(con), length(genome(gr)))
191
-    genome(gr) <- genbuild
192
-    gr
193
-}
194
-
195
-#' @import SingleCellExperiment
196
-#' @export
197
-setMethod("import", "TENxH5", function(con, format, text, ...) {
198
-    if (!requireNamespace("HDF5Array", quietly = TRUE))
199
-        stop("Install 'HDF5Array' to import TENxH5 files")
200
-    matrixdata <- HDF5Array::TENxMatrix(path(con), con@group)
201
-    if (identical(con@version, "3")) {
202
-        sce <- SingleCellExperiment::SingleCellExperiment(
203
-            assays = list(counts = matrixdata), rowRanges = .getRowRanges(con)
204
-        )
205
-        rownames(sce) <- mcols(sce)[["ID"]]
206
-        sce <- sce[seqnames(rowRanges(sce)) != "NA_character_", ]
207
-        splitAltExps(sce, rowData(sce)[["Type"]], ref = "Gene Expression")
208
-    } else {
209
-        stop("Version 2 not supported yet.")
210
-    }
211
-})
212
-
213 4
 #' @import SummarizedExperiment
214 5
 #' @export
215 6
 setMethod("import", "TENxMTX", function(con, format, text, ...) {
... ...
@@ -299,20 +90,32 @@ setMethod("import", "TENxFileList", function(con, format, text, ...) {
299 90
     )
300 91
 })
301 92
 
93
+#' Importing peak-annotation files
94
+#'
95
+#' @keywords internal
96
+#'
302 97
 #' @examples
98
+#'
303 99
 #' fi <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_atac_peak_annotation.tsv"
304 100
 #' pa <- .import_peak_anno(fi)
101
+#'
305 102
 .import_peak_anno <- function(file) {
306 103
     stopifnot(endsWith(file, "peak_annotation.tsv"))
307 104
     panno <- readr::read_tsv(file, col_types = c("c", "n", "n", "c", "n", "c"))
308 105
     makeGRangesFromDataFrame(panno, keep.extra.columns = TRUE)
309 106
 }
310 107
 
108
+#' Importing fragment files
109
+#'
110
+#' @keywords internal
111
+#'
311 112
 #' @examples
113
+#'
312 114
 #' con <- TENxFile("~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5")
313 115
 #' sce <- import(con)
314 116
 #' fr <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz"
315 117
 #' fra <- .import_fragments(fr)
118
+#'
316 119
 .import_fragments <- function(file, withIndex = TRUE) {
317 120
     ## read tabix format with Rsamtools
318 121
     stopifnot(endsWith(file, "atac_fragments.tsv.gz"))
... ...
@@ -328,7 +131,9 @@ setMethod("import", "TENxFileList", function(con, format, text, ...) {
328 131
     ## names from table in the link below
329 132
     ## https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments
330 133
     names(ex) <- c("chrom", "chromStart", "chromEnd", "barcode", "readSupport")
331
-    ggr <- makeGRangesFromDataFrame(ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE)
134
+    ggr <- makeGRangesFromDataFrame(
135
+        ex, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE
136
+    )
332 137
     ggrl <- as(splitAsList(ggr, mcols(ggr)$barcode), "GRangesList")
333 138
     RaggedExperiment(ggrl)
334 139
     # compactAssay(erg, i = "readSupport", sparse = TRUE)
... ...
@@ -32,3 +32,81 @@ TENxFileList <- function(..., compressed = FALSE) {
32 32
     asl <- SimpleList(dots[[1]])
33 33
     .TENxFileList(asl, compressed = compressed)
34 34
 }
35
+
36
+#' @importFrom utils untar tail
37
+.TENxUntar <- function(con) {
38
+    dir.create(tempdir <- tempfile())
39
+    untar(path(con), exdir = tempdir)
40
+    tempdir
41
+}
42
+
43
+.readInFuns <- function(files) {
44
+    file_exts <- .get_ext(files)
45
+    lapply(.setNames(file_exts, basename(files)), function(ext) {
46
+        switch(
47
+            ext,
48
+            mtx.gz = Matrix::readMM,
49
+            tsv.gz = function(...)
50
+                readr::read_tsv(col_names = FALSE, show_col_types = FALSE, ...)
51
+        )
52
+    })
53
+}
54
+
55
+.cleanUpFuns <- function(datalist) {
56
+    if (is.null(names(datalist)))
57
+        stop("'datalist' names must correspond to originating file names")
58
+    lapply(.setNames(nm = names(datalist)), function(fname) {
59
+        switch(
60
+            fname,
61
+            features.tsv.gz = function(df) {
62
+                names(df) <- c("ID", "Symbol", "Type", "Chr", "Start", "End")
63
+                df
64
+            },
65
+            barcodes.tsv.gz = function(df) {
66
+                names(df) <- "barcode"
67
+                df
68
+            },
69
+            matrix.mtx.gz = function(mat) {
70
+                as(mat, "dgCMatrix")
71
+            },
72
+        )
73
+    })
74
+}
75
+
76
+.TENxDecompress <- function(con) {
77
+    res_ext <- .get_ext(path(con))
78
+    if (identical(res_ext, "tar.gz")) {
79
+        tenfolder <- .TENxUntar(con)
80
+        gfolder <- list.files(tenfolder, full.names = TRUE)
81
+        if (file.info(gfolder)$isdir)
82
+            gfiles <- list.files(gfolder, recursive = TRUE, full.names = TRUE)
83
+        else
84
+            gfiles <- gfolder
85
+        gdata <- Map(
86
+            f = function(reader, x) {
87
+                reader(x)
88
+            }, reader = .readInFuns(gfiles), x = gfiles
89
+        )
90
+        Map(f = function(cleaner, x) {
91
+                cleaner(x)
92
+            }, cleaner = .cleanUpFuns(gdata), x = gdata
93
+        )
94
+    } else {
95
+        stop("Extension type: ", res_ext, " not supported")
96
+    }
97
+}
98
+
99
+#' @export
100
+setMethod("import", "TENxFileList", function(con, format, text, ...) {
101
+    if (con@compressed)
102
+        fdata <- .TENxDecompress(con)
103
+    else
104
+        fdata <- con@listData
105
+    mat <- fdata[["matrix.mtx.gz"]]
106
+    colnames(mat) <- unlist(fdata[["barcodes.tsv.gz"]])
107
+    warning("Matrix of mixed types; see in rowData(x)")
108
+    SingleCellExperiment::SingleCellExperiment(
109
+        SimpleList(counts = mat),
110
+        rowData = fdata[["features.tsv.gz"]]
111
+    )
112
+})
35 113
new file mode 100644
... ...
@@ -0,0 +1,100 @@
1
+#' @importFrom MatrixGenerics rowRanges
2
+NULL
3
+
4
+#' @exportClass TENxH5
5
+.TENxH5 <- setClass(
6
+    Class = "TENxH5",
7
+    contains = "TENxFile",
8
+    slots = c(version = "character", group = "character")
9
+)
10
+
11
+.get_h5_group <- function(fpath) {
12
+    if (!requireNamespace("rhdf5", quietly = TRUE))
13
+        stop("Install 'rhdf5' to work with TENxH5")
14
+    l1 <- rhdf5::h5ls(fpath, recursive = FALSE)
15
+    l1[l1$otype == "H5I_GROUP", "name"]
16
+}
17
+
18
+.KNOWN_H5_GROUPS <- c("matrix", "outs")
19
+
20
+.check_group_ok <- function(fpath) {
21
+    gname <- .get_h5_group(fpath)
22
+    if (!gname %in% .KNOWN_H5_GROUPS)
23
+        stop("'group' not recognized")
24
+    gname
25
+}
26
+
27
+#' @rdname TENxH5
28
+#' @title Import H5 files from 10X
29
+#'
30
+#' @examples
31
+#'
32
+#' h5f <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5"
33
+#' con <- TENxFile(h5f)
34
+#' import(con)
35
+#'
36
+#' @export
37
+TENxH5 <-
38
+    function(resource, version = c("3", "2"), ...)
39
+{
40
+    version <- match.arg(version)
41
+    group <- .check_group_ok(resource)
42
+    .TENxH5(resource = resource, group = group, version = version, ...)
43
+}
44
+
45
+gene.meta <- data.frame(
46
+    version = c("3", "2"),
47
+    ID = c("/features/id", "/genes"),
48
+    Symbol = c("/features/name", "/gene_names"),
49
+    Type = c("/features/feature_type", NA_character_)
50
+)
51
+
52
+.getRowDat <- function(con) {
53
+    gm <- gene.meta[
54
+        con@version == gene.meta[["version"]],
55
+        !names(gene.meta) %in% c("group", "version")
56
+    ]
57
+    gm[] <- Filter(Negate(is.na), gm)
58
+    res <- lapply(gm, function(colval) {
59
+        readname <- paste0(con@group, colval)
60
+        rhdf5::h5read(path(con), readname)
61
+    })
62
+    as.data.frame(res)
63
+}
64
+
65
+.getGenome <- function(con) {
66
+    gloc <- "matrix/features/genome"
67
+    gen <- unique(rhdf5::h5read(path(con), gloc))
68
+    if (length(gen) != 1L)
69
+        stop("The genome build in ", gloc, " is not consistent")
70
+    gen
71
+}
72
+
73
+#' @importFrom GenomeInfoDb genome genome<-
74
+#' @importFrom S4Vectors mcols<-
75
+.getRowRanges <- function(con) {
76
+    interval <- rhdf5::h5read(path(con), "matrix/features/interval")
77
+    interval[interval == "NA"] <- "NA_character_:0"
78
+    gr <- as(as.character(interval), "GRanges")
79
+    mcols(gr) <- .getRowDat(con)
80
+    genbuild <- rep(.getGenome(con), length(genome(gr)))
81
+    genome(gr) <- genbuild
82
+    gr
83
+}
84
+
85
+#' @import SingleCellExperiment
86
+#' @export
87
+setMethod("import", "TENxH5", function(con, format, text, ...) {
88
+    if (!requireNamespace("HDF5Array", quietly = TRUE))
89
+        stop("Install 'HDF5Array' to import TENxH5 files")
90
+    matrixdata <- HDF5Array::TENxMatrix(path(con), con@group)
91
+    if (identical(con@version, "3")) {
92
+        sce <- SingleCellExperiment::SingleCellExperiment(
93
+            assays = list(counts = matrixdata), rowRanges = .getRowRanges(con)
94
+        )
95
+        rownames(sce) <- mcols(sce)[["ID"]]
96
+        splitAltExps(sce, rowData(sce)[["Type"]], ref = "Gene Expression")
97
+    } else {
98
+        stop("Version 2 not supported yet.")
99
+    }
100
+})
... ...
@@ -1,5 +1,5 @@
1 1
 % Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/TENxFile-class.R
2
+% Please edit documentation in R/TENxH5-class.R
3 3
 \name{TENxH5}
4 4
 \alias{TENxH5}
5 5
 \title{Import H5 files from 10X}
6 6
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/TENxFile-methods.R
3
+\name{.import_fragments}
4
+\alias{.import_fragments}
5
+\title{Importing fragment files}
6
+\usage{
7
+.import_fragments(file, withIndex = TRUE)
8
+}
9
+\description{
10
+Importing fragment files
11
+}
12
+\examples{
13
+
14
+con <- TENxFile("~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5")
15
+sce <- import(con)
16
+fr <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz"
17
+fra <- .import_fragments(fr)
18
+
19
+}
20
+\keyword{internal}
0 21
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/TENxFile-methods.R
3
+\name{.import_peak_anno}
4
+\alias{.import_peak_anno}
5
+\title{Importing peak-annotation files}
6
+\usage{
7
+.import_peak_anno(file)
8
+}
9
+\description{
10
+Importing peak-annotation files
11
+}
12
+\examples{
13
+
14
+fi <- "~/data/10x/pbmc_3k/pbmc_granulocyte_sorted_3k_atac_peak_annotation.tsv"
15
+pa <- .import_peak_anno(fi)
16
+
17
+}
18
+\keyword{internal}