Browse code

Consistent column names, add new function addImmunogenList()

Katharina Waury authored on 08/02/2025 20:00:07
Showing 8 changed files

... ...
@@ -1,6 +1,7 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3 3
 export(addImmunogen)
4
+export(addImmunogenList)
4 5
 export(evaluateImmunogen)
5 6
 export(getProteinFeatures)
6 7
 export(plotImmunogen)
... ...
@@ -43,6 +43,9 @@ addImmunogen <- function(proteinDF, start=NULL, end=NULL, seq=NULL, name) {
43 43
     # calculate immunogen end position
44 44
     end <- start + nchar(seq) - 1
45 45
 
46
+    # check if immunogen length is within range
47
+    checkImmunogenRange(start, end, proteinLength)
48
+
46 49
   # if start and end position are provided, check validity of range
47 50
   } else if (!is.null(start) & !is.null(end)) {
48 51
     checkImmunogenRange(start, end, proteinLength)
49 52
new file mode 100644
... ...
@@ -0,0 +1,50 @@
1
+#' Add multiple immunogens to the Protein DataFrame
2
+#'
3
+#' @param proteinDF Protein DataFrame created by call to getProteinFeatures()
4
+#' @param immunogenDF DataFrame where each row represents an immunogen.
5
+#'        Must contain a `name` column (string) and either both `start` and `end` columns (integer) or a `seq` column (string).
6
+#'
7
+#' @description
8
+#' Calls `addImmunogen()` for each row in `immunogenDF` to add multiple immunogens
9
+#' to the given `proteinDF`.
10
+#'
11
+#' @return Updated Protein DataFrame with all immunogens added as new columns
12
+#' @export
13
+#'
14
+#' @examples
15
+#' proteinDF <- getProteinFeatures("P55087")
16
+#' immunogenDF <- data.frame(
17
+#'   start = c(10, 40, NA),
18
+#'   end = c(30, 60, NA),
19
+#'   seq = c(NA, NA, "RFKEAFSKAAQQTKGSYMEVEDNRSQVETDD"),
20
+#'   name = c("A12", "B34", "HPA"),
21
+#'   stringsAsFactors = FALSE
22
+#' )
23
+#' proteinDF <- addImmunogenList(proteinDF, immunogenDF)
24
+addImmunogenList <- function(proteinDF, immunogenDF) {
25
+
26
+  # Check if required columns exist
27
+  if (!"name" %in% colnames(immunogenDF)) {
28
+    stop("The immunogen dataframe must contain a 'name' column.")
29
+  }
30
+  if (!("seq" %in% colnames(immunogenDF)) && !all(c("start", "end") %in% colnames(immunogenDF))) {
31
+    stop("The immunogen dataframe must contain either a 'seq' column or both 'start' and 'end' columns.")
32
+  }
33
+
34
+  # A value is usable only if its column exists (`[[` gives NULL otherwise) and it is not NA
35
+  hasValue <- function(x) length(x) == 1 && !is.na(x)
36
+
37
+  # Iterate through each row and add immunogen
38
+  for (i in seq_len(nrow(immunogenDF))) {
39
+    row <- immunogenDF[i, , drop = FALSE]
40
+    # Positions take precedence; the sequence is only used when start/end are unavailable
41
+    if (hasValue(row[["start"]]) && hasValue(row[["end"]])) {
42
+      proteinDF <- addImmunogen(proteinDF, start = row[["start"]], end = row[["end"]], name = row[["name"]])
43
+    } else if (hasValue(row[["seq"]])) {
44
+      proteinDF <- addImmunogen(proteinDF, seq = as.character(row[["seq"]]), name = row[["name"]])
45
+    } else {
46
+      stop("Invalid values. Please check your immunogen dataframe.")
47
+    }
48
+  }
49
+  return(proteinDF)
50
+}
... ...
@@ -8,8 +8,8 @@
8 8
 #'
9 9
 #' @description
10 10
 #' By calling `evaluateImmunogen()`, the immunogens associated with a Protein DataFrame can be evaluated regarding
11
-#' their suitability fir antibody binding in natively folded proteins. By calling the function without specifying
12
-#' an immunogen, all immunogens of the current protein will be evaluated. The summary DataFrame contains one row per
11
+#' their suitability for antibody binding in natively folded proteins. By calling the function without specifying
12
+#' an immunogen, all immunogens of the current protein dataframe will be evaluated. The summary DataFrame contains one row per
13 13
 #' evaluated immunogen.
14 14
 #'
15 15
 #' @examples
... ...
@@ -21,8 +21,8 @@
21 21
 evaluateImmunogen <- function(proteinDF, immunogen=NULL) {
22 22
 
23 23
   # feature column names of protein dataframe
24
-  features <- c("Uniprot", "Position", "Residue", "PTM", "disulfideBridge", "Membrane",
25
-                "Binding", "Disorder", "secondaryStructure", "solventAccessibility")
24
+  features <- c("Uniprot", "Position", "Residue", "PTM", "DisulfideBridge", "Membrane",
25
+                "ProteinBinding", "Disorder", "SecondaryStructure", "SolventAccessibility")
26 26
 
27 27
   fullDF <- data.frame()
28 28
 
... ...
@@ -71,8 +71,8 @@ evaluateImmunogen <- function(proteinDF, immunogen=NULL) {
71 71
 checkIfImmunogenExists <- function(colnamesDF, name) {
72 72
 
73 73
   # feature column names of protein dataframe
74
-  features <- c("Uniprot", "Position", "Residue", "PTM", "disulfideBridge", "Membrane",
75
-                "Binding", "Disorder", "secondaryStructure", "solventAccessibility")
74
+  features <- c("Uniprot", "Position", "Residue", "PTM", "DisulfideBridge", "Membrane",
75
+                "ProteinBinding", "Disorder", "SecondaryStructure", "SolventAccessibility")
76 76
 
77 77
   # raise error if immunogen names not present in dataframe or same as feature column name
78 78
   if (!(name %in% colnamesDF)) {
... ...
@@ -136,28 +136,28 @@ createSummaryDataFrame <- function(immunogenDF, immunogen) {
136 136
 
137 137
   proportionMembrane <- calculateRegionProportions(immunogenDF, "Membrane")
138 138
   proportionDisorder <- calculateRegionProportions(immunogenDF, "Disorder")
139
-  proportionBinding <- calculateRegionProportions(immunogenDF, "Binding")
139
+  proportionBinding <- calculateRegionProportions(immunogenDF, "ProteinBinding")
140 140
 
141 141
   # add secondary structure proportions
142
-  proportionsSecondaryStr <- prop.table(table(immunogenDF[["secondaryStructure"]]))
142
+  proportionsSecondaryStr <- prop.table(table(immunogenDF[["SecondaryStructure"]]))
143 143
   proportionsSecondaryStr <- addMissingClasses(proportionsSecondaryStr, c("Helix", "Sheet", "Other"))
144 144
 
145 145
   # add buried/exposed proportions
146
-  proportionsSolventAcc <- prop.table(table(immunogenDF[["solventAccessibility"]]))
146
+  proportionsSolventAcc <- prop.table(table(immunogenDF[["SolventAccessibility"]]))
147 147
   proportionsSolventAcc <- addMissingClasses(proportionsSolventAcc, c("Buried", "Exposed"))
148 148
 
149 149
   # create a summary dataframe
150 150
   summaryDF <- data.frame(
151
-    Sum_PTM = sumPTM,
152
-    Sum_DisulfideBridges = sumBridge,
153
-    Proportion_Membrane = proportionMembrane,
154
-    Proportion_Disorder = proportionDisorder,
155
-    Proportion_Binding = proportionBinding,
156
-    Proportions_Helix = proportionsSecondaryStr["Helix"][[1]],
157
-    Proportions_Sheet = proportionsSecondaryStr["Sheet"][[1]],
158
-    Proportions_Coil = proportionsSecondaryStr["Other"][[1]],
159
-    Proportions_SolventAccessibility_Buried = proportionsSolventAcc["Buried"][[1]],
160
-    Proportions_SolventAccessibility_Exposed = proportionsSolventAcc["Exposed"][[1]]
151
+    SumPTM = sumPTM,
152
+    SumDisulfideBridges = sumBridge,
153
+    ProportionMembrane = proportionMembrane,
154
+    ProportionDisorder = proportionDisorder,
155
+    ProportionBinding = proportionBinding,
156
+    ProportionsHelix = proportionsSecondaryStr["Helix"][[1]],
157
+    ProportionsSheet = proportionsSecondaryStr["Sheet"][[1]],
158
+    ProportionsCoil = proportionsSecondaryStr["Other"][[1]],
159
+    ProportionsBuried = proportionsSolventAcc["Buried"][[1]],
160
+    ProportionsExposed = proportionsSolventAcc["Exposed"][[1]]
161 161
     )
162 162
 
163 163
   # set all rownames to immunogen name
164 164
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/addImmunogenList.R
3
+\name{addImmunogenList}
4
+\alias{addImmunogenList}
5
+\title{Add multiple immunogens to the Protein DataFrame}
6
+\usage{
7
+addImmunogenList(proteinDF, immunogenDF)
8
+}
9
+\arguments{
10
+\item{proteinDF}{Protein DataFrame created by call to getProteinFeatures()}
11
+
12
+\item{immunogenDF}{DataFrame where each row represents an immunogen.
13
+Must contain a `name` column (string) and either both `start` and `end` columns (integer) or a `seq` column (string).}
14
+}
15
+\value{
16
+Updated Protein DataFrame with all immunogens added as new columns
17
+}
18
+\description{
19
+Calls `addImmunogen()` for each row in `immunogenDF` to add multiple immunogens
20
+to the given `proteinDF`.
21
+}
22
+\examples{
23
+proteinDF <- getProteinFeatures("P55087")
24
+immunogenDF <- data.frame(
25
+  start = c(10, 40, NA),
26
+  end = c(30, 60, NA),
27
+  seq = c(NA, NA, "RFKEAFSKAAQQTKGSYMEVEDNRSQVETDD"),
28
+  name = c("A12", "B34", "HPA"),
29
+  stringsAsFactors = FALSE
30
+)
31
+proteinDF <- addImmunogenList(proteinDF, immunogenDF)
32
+}
... ...
@@ -16,8 +16,8 @@ Summary DataFrame providing statistics on immunogen
16 16
 }
17 17
 \description{
18 18
 By calling `evaluateImmunogen()`, the immunogens associated with a Protein DataFrame can be evaluated regarding
19
-their suitability fir antibody binding in natively folded proteins. By calling the function without specifying
20
-an immunogen, all immunogens of the current protein will be evaluated. The summary DataFrame contains one row per
19
+their suitability for antibody binding in natively folded proteins. By calling the function without specifying
20
+an immunogen, all immunogens of the current protein dataframe will be evaluated. The summary DataFrame contains one row per
21 21
 evaluated immunogen.
22 22
 }
23 23
 \examples{
24 24
new file mode 100644
... ...
@@ -0,0 +1,29 @@
1
+library(immunogenViewer)
2
+exampleDF <- immunogenViewer::getProteinFeatures("P55087")
3
+immunogenDF <- data.frame(
4
+  start = c(10, 40, NA),
5
+  end = c(30, 60, NA),
6
+  seq = c(NA, NA, "RFKEAFSKAAQQTKGSYMEVEDNRSQVETDD"),
7
+  name = c("A12", "B34", "HPA"),
8
+  stringsAsFactors = FALSE
9
+  )
10
+
11
+# Expected error messages are matched literally (fixed = TRUE), not as regular expressions
12
+test_that("Dataframe is returned if successful", {
13
+  expect_s3_class(addImmunogenList(exampleDF, immunogenDF), "data.frame")
14
+})
15
+
16
+test_that("Missing name column raises error", {
17
+  expect_error(addImmunogenList(exampleDF, data.frame(seq=c("RFKEAFSKAAQQTKGSYMEVEDNRSQVETDD"))),
18
+    "The immunogen dataframe must contain a 'name' column.", fixed = TRUE)
19
+})
20
+
21
+test_that("Missing position and sequence columns raise error", {
22
+  expect_error(addImmunogenList(exampleDF, data.frame(name=c("A12"))),
23
+     "The immunogen dataframe must contain either a 'seq' column or both 'start' and 'end' columns.", fixed = TRUE)
24
+})
25
+
26
+test_that("Wrong immunogen dataframe raises errors", {
27
+  expect_error(addImmunogenList(exampleDF, data.frame(start = c(NA), end = c(NA), seq = c(NA), name = c("A12"))),
28
+      "Invalid values. Please check your immunogen dataframe.", fixed = TRUE)
29
+})
... ...
@@ -50,6 +50,29 @@ library(immunogenViewer)
50 50
 
51 51
 To retrieve the features for the protein of interest the correct UniProt ID (also known as accession number) is required. If the UniProt ID is not known yet, one can search the [UniProtKB](https://www.uniprot.org/) using the gene or protein name. Be sure to select the UniProt ID of the correct organism and preferably search within reviewed SwissProt entries instead of unreviewed TrEMBL entries. Our example protein is the human protein TREM2 (UniProt ID: [Q9NZC2](https://www.uniprot.org/uniprotkb/Q9NZC2/entry)). Using `getProteinFeatures()` relevant features from UniProt and PredictProtein are retrieved. Interaction with UniProt is done using the Bioconductor package [UniProt.ws](https://bioconductor.org/packages/release/bioc/html/UniProt.ws.html). To see how the dataframe is structured, we will look at the returned dataframe. 
52 52
 
53
+The following protein features are included:
54
+
55
+-   Secondary Structure: Each residue is assigned as helix, sheet or other conformation based on predictions by RePROF. Source:
56
+    [PredictProtein](https://www.predictprotein.org/)
57
+
58
+-   Solvent Accessibility: Each residue is assigned as exposed or buried based on predictions by RePROF. Source:
59
+    [PredictProtein](https://www.predictprotein.org/)
60
+
61
+-   Membrane: Residues that are annotated as [transmembrane](https://www.uniprot.org/help/transmem) or
62
+    [intramembrane](https://www.uniprot.org/help/intramem). Source: [UniProtKB](https://www.uniprot.org/)
63
+
64
+-   Protein Binding: Each residue that is predicted to bind proteins based on predictions by ProNA. Source:
65
+    [PredictProtein](https://www.predictprotein.org/)
66
+
67
+-   Disorder: Each residue that is predicted to be disordered based on predictions by Meta-Disorder. Source: [PredictProtein](https://www.predictprotein.org/)
68
+
69
+-   PTM: Residues that are annotated as [modified residues](https://www.uniprot.org/help/mod_res), 
70
+    [lipidation](https://www.uniprot.org/help/lipid) or [glycosylation](https://www.uniprot.org/help/carbohyd). Source:
71
+    [UniProtKB](https://www.uniprot.org/) 
72
+    
73
+-   Disulfide Bridges: Residues that are annotated as [disulfide bonds](https://www.uniprot.org/help/disulfid). Source: [UniProtKB](https://www.uniprot.org/)
74
+
75
+To see how the dataframe is structured, we will look at the returned dataframe.
53 76
 
54 77
 ```{r get-features}
55 78
 protein <- getProteinFeatures("Q9NZC2")
... ...
@@ -71,7 +94,6 @@ protein <- addImmunogen(protein, seq = "HGQKPGTHPPSELD", name = "EB07921")
71 94
 colnames(protein)
72 95
 ```
73 96
 
74
-
75 97
 ### Renaming an immunogen
76 98
 
77 99
 Already added immunogens can be renamed using `renameImmunogen()` if the provided start and end position are correct but the name should be updated. This way a typo can be corrected or a more informative name added instead of re-adding the immunogen. The column name in the protein dataframe is then updated.
... ...
@@ -82,7 +104,6 @@ protein <- renameImmunogen(protein, oldName = "ABIN2783734_", newName = "ABIN278
82 104
 colnames(protein)
83 105
 ```
84 106
 
85
-
86 107
 ### Removing an immunogen
87 108
 
88 109
 A previously added immunogen can be removed from the protein dataframe using `removeImmunogen()`. The corresponding column is dropped from the protein dataframe.
... ...
@@ -103,7 +124,10 @@ plotProtein(protein)
103 124
 
104 125
 ## Visualizing a specific immunogen
105 126
 
106
-If interested in one specific immunogen, one can visualize the relevant part of the protein sequence. In this plot the amino acid sequence of the immunogen is shown along the x axis while the same features as in the protein plot are included. 
127
+If interested in one specific immunogen, one can visualize the relevant
128
+part of the protein sequence. In this plot the amino acid sequence of
129
+the immunogen is shown along the x axis while the same features as in
130
+the protein plot are included.
107 131
 
108 132
 ```{r visualize_immunogen, fig.fullwidth=TRUE, fig.width=10, fig.height=10, out.width = "100%"}
109 133
 plotImmunogen(protein, "ABIN2783734")
... ...
@@ -111,7 +135,30 @@ plotImmunogen(protein, "ABIN2783734")
111 135
 
112 136
 ## Evaluating the immunogens
113 137
 
114
-Apart from visualizing specific immunogens, it is also possible to summarize the protein features within a specific immunogen. This can either be done for an immunogen of interest or for all immunogens added to a protein dataframe at once. The output is a summary dataframe that can be sorted by the feature columns. By sorting the most suitable immunogen, e.g., with the highest fraction of exposed residues, can be selected.
138
+Apart from visualizing specific immunogens, it is also possible to summarize the protein features within a specific immunogen. This can either be done for one immunogen of interest or for all immunogens added to a protein dataframe at once. The output is a summary dataframe that can be sorted by the feature columns. By sorting, the most suitable immunogen, e.g., the one with the highest fraction of exposed residues, can be selected. 
139
+
140
+The following summary statistics are included:
141
+
142
+-   SumPTM: Number of PTM residues within the immunogen.
143
+
144
+-   SumDisulfideBridges: Number of disulfide bond residues within the immunogen.
145
+
146
+-   ProportionMembrane: Proportion of immunogen residues that are annotated as membrane residues.
147
+
148
+-   ProportionDisorder: Proportion of immunogen residues that are predicted as disordered.
149
+
150
+-   ProportionBinding: Proportion of immunogen residues that are predicted to bind proteins.
151
+
152
+-   ProportionsHelix: Proportion of immunogen residues that are predicted to form alpha helices.
153
+
154
+-   ProportionsSheet: Proportion of immunogen residues that are predicted to form beta sheets.
155
+
156
+-   ProportionsCoil: Proportion of immunogen residues that are predicted to form coils.
157
+
158
+-   ProportionsBuried: Proportion of immunogen residues that are predicted to be buried.
159
+
160
+-   ProportionsExposed: Proportion of immunogen residues that are predicted to be solvent exposed.
161
+
115 162
 
116 163
 ```{r evaluate}
117 164
 immunogens <- evaluateImmunogen(protein)
... ...
@@ -121,17 +168,17 @@ DT::datatable(immunogens, width = "80%", options = list(scrollX = TRUE))
121 168
 
122 169
 # Important Notes
123 170
 
124
-* The length of an immunogen has to be between 10 and 50 amino acids. 
125
-* The secondary structure "Other" usually stand for coil structures.
126
-* For the call to `getProteinFeatures()` the taxonomy ID for the protein's species has to be set. The default is human (ID: 9606). If the protein of interest is from a different species, the correct taxonomy ID must be set as a parameter.
127
-    
128
-# References {.unnumbered}
171
+-   The length of an immunogen has to be between 10 and 50 amino acids.
172
+-   The secondary structure "Other" usually stands for coil structures.
173
+-   For the call to `getProteinFeatures()` the taxonomy ID for the
174
+    protein's species has to be set. The default is human (ID: 9606). If
175
+    the protein of interest is from a different species, the correct
176
+    taxonomy ID must be set as a parameter.
129 177
 
178
+# References {.unnumbered}
130 179
 
131 180
 # Session info
132 181
 
133 182
 ```{r sessioninfo}
134 183
 sessionInfo()
135 184
 ```
136
-
137
-