... | ... |
@@ -8,7 +8,7 @@ Description: Plots protein properties and visualizes position of peptide immunog |
8 | 8 |
suitability for antibody-based methods aiming to detect native proteins. |
9 | 9 |
License: Apache License (>= 2) |
10 | 10 |
Encoding: UTF-8 |
11 |
-LazyData: true |
|
11 |
+LazyData: false |
|
12 | 12 |
Config/testthat/edition: 3 |
13 | 13 |
RoxygenNote: 7.3.1 |
14 | 14 |
VignetteBuilder: knitr |
... | ... |
@@ -18,7 +18,8 @@ Imports: |
18 | 18 |
ggplot2, |
19 | 19 |
httr, |
20 | 20 |
jsonlite, |
21 |
- patchwork |
|
21 |
+ patchwork, |
|
22 |
+ UniProt.ws |
|
22 | 23 |
Suggests: |
23 | 24 |
BiocStyle, |
24 | 25 |
knitr, |
... | ... |
@@ -1,34 +1,36 @@ |
1 | 1 |
#' Retrieve structural and functional features to create a protein DataFrame |
2 | 2 |
#' |
3 | 3 |
#' @param uniprot String, UniProt ID |
4 |
+#' @param taxId Integer, NCBI Taxonomy ID of the species the UniProt ID belongs to (default: 9606, human) |
|
4 | 5 |
#' |
5 | 6 |
#' @description |
6 |
-#' By providing a valid UniProt ID, information from UniProt (https://www.uniprot.org/) and PredictProtein (https://predictprotein.org/) is queried via their |
|
7 |
-#' respective APIs. The retrieved information regarding secondary structure, solvent accessibility, membrane |
|
8 |
-#' regions, protein-binding regions, disordered regions, PTMs and disulfide bridges is saved per residue |
|
9 |
-#' within a Protein DataFrame. After calling `getProteinFeatures()`, immunogens can be added to the Protein |
|
10 |
-#' DataFrame. |
|
7 |
+#' By providing a valid UniProt ID, information from UniProt (https://www.uniprot.org/) and PredictProtein |
|
8 |
+#' (https://predictprotein.org/) is queried via their respective APIs. The retrieved information regarding |
|
9 |
+#' secondary structure, solvent accessibility, membrane regions, protein-binding regions, disordered regions, |
|
10 |
+#' PTMs and disulfide bridges is saved per residue within a Protein DataFrame. |
|
11 |
+#' After calling `getProteinFeatures()`, immunogens can be added to the Protein DataFrame. |
|
11 | 12 |
#' |
12 | 13 |
#' @return Protein DataFrame |
13 | 14 |
#' @export |
14 | 15 |
#' |
16 |
+#' @import UniProt.ws |
|
17 |
+#' |
|
15 | 18 |
#' @examples getProteinFeatures("P55087") |
16 |
-getProteinFeatures <- function(uniprot) { |
|
19 |
+getProteinFeatures <- function(uniprot, taxId = 9606) { |
|
17 | 20 |
|
18 |
- # check if Uniprot ID is valid |
|
21 |
+ # check if UniProt ID is valid |
|
19 | 22 |
if (is.character(uniprot) == FALSE) { |
20 | 23 |
stop("Please provide a UniProt ID.") |
21 | 24 |
} |
22 | 25 |
|
23 | 26 |
# retrieve results from UniProt |
24 |
- resultUniprot <- accessUniprot(uniprot) |
|
25 |
- uniprotDF <- resultUniprot$features |
|
27 |
+ uniprotDF <- accessUniprot(uniprot, taxId=9606) |
|
26 | 28 |
|
27 | 29 |
# retrieve results from PredictProtein |
28 | 30 |
predictProteinDF <- accessPredictProtein(uniprot) |
29 | 31 |
|
30 | 32 |
# retrieve protein sequence |
31 |
- seq <- getProteinSequence(resultUniprot) |
|
33 |
+ seq <- uniprotDF$Sequence |
|
32 | 34 |
|
33 | 35 |
# create feature dataframe |
34 | 36 |
df <- createFeatureDataFrame(uniprot, seq, uniprotDF, predictProteinDF) |
... | ... |
@@ -38,24 +40,16 @@ getProteinFeatures <- function(uniprot) { |
38 | 40 |
} |
39 | 41 |
|
40 | 42 |
|
41 |
-accessUniprot <- function(uniprot) { |
|
42 |
- |
|
43 |
- # create protein-specific UniProt URL |
|
44 |
- url <- paste0("https://www.uniprot.org/uniprot/", uniprot, ".json") |
|
45 |
- |
|
46 |
- # make GET request |
|
47 |
- response <- httr::GET(url = url) |
|
43 |
# Query UniProt for one accession through the UniProt.ws interface.
#
# `uniprot` is the UniProtKB accession used as lookup key, `taxId` the taxonomy
# ID used to set up the UniProt.ws interface. The result of `select()` is
# returned unchanged; it holds the sequence plus the disulfide bond, modified
# residue, lipidation and glycosylation feature columns.
accessUniprot <- function(uniprot, taxId) {

  # feature columns requested from UniProt
  requestedColumns <- c("sequence", "ft_disulfid", "ft_mod_res", "ft_lipid", "ft_carbohyd")

  # species-specific UniProt interface
  uniprotInterface <- UniProt.ws(taxId=taxId)

  select(uniprotInterface, keys=uniprot, columns=requestedColumns, keytype="UniProtKB")
}
60 | 54 |
|
61 | 55 |
|
... | ... |
@@ -80,22 +74,6 @@ accessPredictProtein <- function(uniprot) { |
80 | 74 |
} |
81 | 75 |
|
82 | 76 |
|
83 |
-getProteinSequence <- function(result) { |
|
84 |
- |
|
85 |
- if (is.list(result) == FALSE) { |
|
86 |
- stop("Expected a list.") |
|
87 |
- } |
|
88 |
- |
|
89 |
- seq <- as.character(result$sequence$value) |
|
90 |
- |
|
91 |
- if (is.character(seq) == FALSE || length(seq) == 0) { |
|
92 |
- stop("Expected a character.") |
|
93 |
- } |
|
94 |
- |
|
95 |
- return(seq) |
|
96 |
-} |
|
97 |
- |
|
98 |
- |
|
99 | 77 |
createFeatureDataFrame <- function(uniprot, seq, uniprotDF, predictProteinDF) { |
100 | 78 |
|
101 | 79 |
# create vectors of positions and residues |
... | ... |
@@ -127,11 +105,11 @@ createFeatureDataFrame <- function(uniprot, seq, uniprotDF, predictProteinDF) { |
127 | 105 |
proteinDF <- addPositions(proteinDF, disorderVector, "Disorder") |
128 | 106 |
|
129 | 107 |
# PTMs (UniProt) |
130 |
- PTMVector <- retrieveSinglePositions(uniprotDF, c("Modified residue", "Lipidation", "Glycosylation")) |
|
108 |
+ PTMVector <- retrievePTMPositions(uniprotDF, c("Modified.residue", "Lipidation", "Glycosylation")) |
|
131 | 109 |
proteinDF <- addPositions(proteinDF, PTMVector, "PTM") |
132 | 110 |
|
133 | 111 |
# disulfide bridges (UniProt) |
134 |
- disulfideBridgeVector <- retrieveSinglePositions(uniprotDF, c("Disulfide bond")) |
|
112 |
+ disulfideBridgeVector <- retrieveDisulfidePositions(uniprotDF$Disulfide.bond) |
|
135 | 113 |
proteinDF <- addPositions(proteinDF, disulfideBridgeVector, "disulfideBridge") |
136 | 114 |
|
137 | 115 |
return(proteinDF) |
... | ... |
@@ -154,15 +132,66 @@ collectRegionPositions <- function(start, end) { |
154 | 132 |
} |
155 | 133 |
|
156 | 134 |
|
157 |
-retrieveSinglePositions <- function(uniprotDF, columns) { |
|
135 |
# Collect the residue positions that take part in disulfide bonds.
#
# `column` is the disulfide bond column of the UniProt result. Each cell is
# expected to be a ";"-separated annotation string in which a bond is written
# as "<FEATURE_KEY> <start>..<end>" (e.g. "DISULFID 35..88"); qualifier
# fragments such as '/evidence="..."' are ignored.
#
# Returns a sorted numeric vector containing the start and end position of
# every bond, or an empty numeric vector when nothing is annotated.
retrieveDisulfidePositions <- function(column) {

  # A protein without annotated bonds yields NA (possibly of type logical) or
  # no column at all (NULL); strsplit() would fail on either, so bail out early.
  column <- column[!is.na(column)]
  if (length(column) == 0) {
    return(numeric())
  }

  annotations <- trimws(unlist(strsplit(as.character(column), ";")))

  # Accept only fragments that are exactly "<KEY> <start>..<end>". Anything
  # else (qualifiers, bonds with unknown or isoform-specific coordinates) is
  # skipped instead of being coerced to NA.
  bondPattern <- "^[A-Z_]+ +([0-9]+)\\.\\.([0-9]+)$"
  bonds <- annotations[grepl(bondPattern, annotations)]

  start <- as.integer(sub(bondPattern, "\\1", bonds))
  end <- as.integer(sub(bondPattern, "\\2", bonds))

  return(sort(as.numeric(c(start, end))))
}
|
160 |
+ |
|
161 |
+ |
|
162 |
# Collect the residue positions of post-translational modifications.
#
# `df` is the UniProt result table and `columns` the names of the feature
# columns to scan. Each cell is expected to be a ";"-separated annotation
# string in which a modified residue is written as "<FEATURE_KEY> <position>"
# (e.g. "MOD_RES 12"), followed by qualifier fragments such as '/note="..."'
# and '/evidence="..."'.
#
# Returns a sorted numeric vector of positions, or an empty numeric vector
# when none of the columns carries an annotation.
retrievePTMPositions <- function(df, columns) {

  positions <- numeric()

  # Only fragments that are exactly "<KEY> <position>" denote a residue.
  positionPattern <- "^[A-Z_]+ +([0-9]+)$"

  for (column in columns) {

    # A missing column gives NULL and an empty cell gives NA; dropping NAs
    # element-wise also keeps this working for tables with more than one row.
    values <- df[[column]]
    values <- values[!is.na(values)]

    if (length(values) == 0) {
      next
    }

    annotations <- trimws(unlist(strsplit(as.character(values), ";")))

    # Notes may themselves contain ";" (e.g. '/note="Phosphoserine; by CK2"').
    # The tail of such a note has no "/" marker, so positions are matched
    # against the full fragment rather than scraped from whatever digits it
    # holds; otherwise "CK2" would be reported as a PTM at residue 2.
    hits <- annotations[grepl(positionPattern, annotations)]

    positions <- c(positions, as.integer(sub(positionPattern, "\\1", hits)))
  }

  return(sort(positions))
}
167 | 196 |
|
168 | 197 |
|