Browse code

Use UniProt.ws package

Katharina Waury authored on 31/05/2024 15:46:27
Showing 5 changed files

... ...
@@ -8,7 +8,7 @@ Description: Plots protein properties and visualizes position of peptide immunog
8 8
     suitability for antibody-based methods aiming to detect native proteins.
9 9
 License: Apache License (>= 2)
10 10
 Encoding: UTF-8
11
-LazyData: true
11
+LazyData: false
12 12
 Config/testthat/edition: 3
13 13
 RoxygenNote: 7.3.1
14 14
 VignetteBuilder: knitr
... ...
@@ -18,7 +18,8 @@ Imports:
18 18
     ggplot2,
19 19
     httr,
20 20
     jsonlite,
21
-    patchwork
21
+    patchwork,
22
+    UniProt.ws
22 23
 Suggests: 
23 24
     BiocStyle,
24 25
     knitr,
... ...
@@ -7,5 +7,6 @@ export(plotImmunogen)
7 7
 export(plotProtein)
8 8
 export(removeImmunogen)
9 9
 export(renameImmunogen)
10
+import(UniProt.ws)
10 11
 import(ggplot2)
11 12
 import(patchwork)
... ...
@@ -1,34 +1,36 @@
1 1
 #' Retrieve structural and functional features to create a protein DataFrame
2 2
 #'
3 3
 #' @param uniprot String, UniProt ID
4
+#' @param taxId Integer, NCBI taxonomy ID of the species to query (default: 9606, human)
4 5
 #'
5 6
 #' @description
6
-#' By providing a valid UniProt ID, information from UniProt (https://www.uniprot.org/) and PredictProtein (https://predictprotein.org/) is queried via their
7
-#' respective APIs. The retrieved information regarding secondary structure, solvent accessibility, membrane
8
-#' regions, protein-binding regions, disordered regions, PTMs and disulfide bridges is saved per residue
9
-#' within a Protein DataFrame. After calling `getProteinFeatures()`, immunogens can be added to the Protein
10
-#' DataFrame.
7
+#' By providing a valid UniProt ID, information from UniProt (https://www.uniprot.org/) and PredictProtein
8
+#' (https://predictprotein.org/) is queried via their respective APIs. The retrieved information regarding
9
+#' secondary structure, solvent accessibility, membrane regions, protein-binding regions, disordered regions,
10
+#' PTMs and disulfide bridges is saved per residue within a Protein DataFrame.
11
+#' After calling `getProteinFeatures()`, immunogens can be added to the Protein DataFrame.
11 12
 #'
12 13
 #' @return Protein DataFrame
13 14
 #' @export
14 15
 #'
16
+#' @import UniProt.ws
17
+#'
15 18
 #' @examples getProteinFeatures("P55087")
16
-getProteinFeatures <- function(uniprot) {
19
+getProteinFeatures <- function(uniprot, taxId = 9606) {
17 20
 
18
-  # check if Uniprot ID is valid
21
+  # check if UniProt ID is valid
19 22
   if (is.character(uniprot) == FALSE) {
20 23
     stop("Please provide a UniProt ID.")
21 24
   }
22 25
 
23 26
   # retrieve results from UniProt
24
-  resultUniprot <- accessUniprot(uniprot)
25
-  uniprotDF <- resultUniprot$features
27
+  uniprotDF <- accessUniprot(uniprot, taxId=9606)
26 28
 
27 29
   # retrieve results from PredictProtein
28 30
   predictProteinDF <- accessPredictProtein(uniprot)
29 31
 
30 32
   # retrieve protein sequence
31
-  seq <- getProteinSequence(resultUniprot)
33
+  seq <- uniprotDF$Sequence
32 34
 
33 35
   # create feature dataframe
34 36
   df <- createFeatureDataFrame(uniprot, seq, uniprotDF, predictProteinDF)
... ...
@@ -38,24 +40,16 @@ getProteinFeatures <- function(uniprot) {
38 40
 }
39 41
 
40 42
 
41
-accessUniprot <- function(uniprot) {
42
-
43
-  # create protein-specific UniProt URL
44
-  url <- paste0("https://www.uniprot.org/uniprot/", uniprot, ".json")
45
-
46
-  # make GET request
47
-  response <- httr::GET(url = url)
43
# Query UniProt through the UniProt.ws interface for one accession.
#
# uniprot: String, UniProt accession used as lookup key (keytype "UniProtKB").
# taxId:   Taxonomy ID passed to the UniProt.ws() constructor.
#
# Returns the table produced by select() for the requested columns
# (sequence, disulfide bonds, modified residues, lipidation, glycosylation).
# Stops with an error if the query yields no rows, so that callers do not
# continue with an empty result.
accessUniprot <- function(uniprot, taxId) {

  # create UniProt interface
  up <- UniProt.ws(taxId = taxId)

  # retrieve sequence, PTM and disulfide bridge information
  featureColumns <- c("sequence", "ft_disulfid", "ft_mod_res", "ft_lipid", "ft_carbohyd")
  result <- select(up, keys = uniprot, columns = featureColumns, keytype = "UniProtKB")

  # fail early instead of passing an empty result on to the feature parsing
  if (is.null(result) || NROW(result) == 0) {
    stop("Error fetching data from UniProt for ", uniprot, call. = FALSE)
  }

  result
}
60 54
 
61 55
 
... ...
@@ -80,22 +74,6 @@ accessPredictProtein <- function(uniprot) {
80 74
 }
81 75
 
82 76
 
83
-getProteinSequence <- function(result) {
84
-
85
-  if (is.list(result) == FALSE) {
86
-    stop("Expected a list.")
87
-  }
88
-
89
-  seq <- as.character(result$sequence$value)
90
-
91
-  if (is.character(seq) == FALSE || length(seq) == 0) {
92
-    stop("Expected a character.")
93
-  }
94
-
95
-  return(seq)
96
-}
97
-
98
-
99 77
 createFeatureDataFrame <- function(uniprot, seq, uniprotDF, predictProteinDF) {
100 78
 
101 79
   # create vectors of positions and residues
... ...
@@ -127,11 +105,11 @@ createFeatureDataFrame <- function(uniprot, seq, uniprotDF, predictProteinDF) {
127 105
   proteinDF <- addPositions(proteinDF, disorderVector, "Disorder")
128 106
 
129 107
   # PTMs (UniProt)
130
-  PTMVector <- retrieveSinglePositions(uniprotDF, c("Modified residue", "Lipidation", "Glycosylation"))
108
+  PTMVector <- retrievePTMPositions(uniprotDF, c("Modified.residue", "Lipidation", "Glycosylation"))
131 109
   proteinDF <- addPositions(proteinDF, PTMVector, "PTM")
132 110
 
133 111
   # disulfide bridges (UniProt)
134
-  disulfideBridgeVector <- retrieveSinglePositions(uniprotDF, c("Disulfide bond"))
112
+  disulfideBridgeVector <- retrieveDisulfidePositions(uniprotDF$Disulfide.bond)
135 113
   proteinDF <- addPositions(proteinDF, disulfideBridgeVector, "disulfideBridge")
136 114
 
137 115
   return(proteinDF)
... ...
@@ -154,15 +132,66 @@ collectRegionPositions <- function(start, end) {
154 132
 }
155 133
 
156 134
 
157
-retrieveSinglePositions <- function(uniprotDF, columns) {
135
# Extract the residue positions involved in disulfide bonds.
#
# column: Character vector holding the disulfide bond annotation string(s),
#         i.e. ";"-separated entries such as "DISULFID 31..96" mixed with
#         note/evidence entries. May be NULL or NA when no annotation exists.
#
# Returns a sorted numeric vector containing the start and end position of
# every "<KEY> <start>..<end>" entry; numeric(0) if there is none. Entries
# whose positions are not plain integers are skipped.
retrieveDisulfidePositions <- function(column) {

  # a missing column (NULL) or NA values carry no annotations
  values <- as.character(column)
  values <- values[!is.na(values)]
  if (length(values) == 0) {
    return(numeric())
  }

  annotations <- unlist(strsplit(values, ";", fixed = TRUE))

  # anchored pattern: feature key, whitespace, then "start..end" and nothing
  # else, so that note/evidence entries can never be mistaken for a range
  rangePattern <- "^\\s*[A-Z_]+\\s+([0-9]+)\\.\\.([0-9]+)\\s*$"
  ranges <- annotations[grepl(rangePattern, annotations)]

  starts <- as.numeric(sub(rangePattern, "\\1", ranges))
  ends <- as.numeric(sub(rangePattern, "\\2", ranges))

  sort(c(starts, ends))
}
160
+
161
+
162
# Extract the residue positions of single-site annotations (PTMs).
#
# df:      Data frame (or list) holding one annotation string per requested
#          column, i.e. ";"-separated entries such as "MOD_RES 12" mixed with
#          note/evidence entries.
# columns: Character vector with the names of the columns to scan. Columns
#          that are absent from df or contain only NA are ignored.
#
# Returns a sorted numeric vector of all positions found; numeric(0) if there
# is none.
retrievePTMPositions <- function(df, columns) {

  # gather the annotation strings of all requested columns; a column that is
  # missing yields character(0) and therefore contributes nothing
  values <- unlist(lapply(columns, function(column) as.character(df[[column]])))
  values <- values[!is.na(values)]
  if (length(values) == 0) {
    return(numeric())
  }

  annotations <- unlist(strsplit(values, ";", fixed = TRUE))

  # skip note and evidence strings
  annotations <- annotations[!grepl("/", annotations, fixed = TRUE)]

  # anchored pattern: feature key, whitespace, then a single integer position;
  # fragments of notes that were split on ";" do not match and are dropped
  positionPattern <- "^\\s*[A-Z_]+\\s+([0-9]+)\\s*$"
  sites <- annotations[grepl(positionPattern, annotations)]

  sort(as.numeric(sub(positionPattern, "\\1", sites)))
}
167 196
 
168 197
 
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{getProteinFeatures}
5 5
 \title{Retrieve structural and functional features to create a protein DataFrame}
6 6
 \usage{
7
-getProteinFeatures(uniprot)
7
+getProteinFeatures(uniprot, taxId = 9606)
8 8
 }
9 9
 \arguments{
10 10
 \item{uniprot}{String, UniProt ID}
... ...
@@ -6,6 +6,7 @@ test_that("Only strings are accepted as input", {
6 6
 
7 7
 # Smoke test: both accessions should yield a Protein DataFrame.
 # NOTE(review): getProteinFeatures() queries remote services per its
 # documentation, so this presumably needs network access - confirm.
 test_that("Dataframe is returned if successful", {
   expect_s3_class(getProteinFeatures("P55087"), "data.frame")
   expect_s3_class(getProteinFeatures("Q9NZC2"), "data.frame")
 })
10 11
 
11 12