... | ... |
@@ -148,7 +148,8 @@ static double |
148 | 148 |
currFuzzVal = fuzz[object_nr]; |
149 | 149 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
150 | 150 |
sum += weight[object_nr] * pow(membership_mat(object_nr, center_nr), currFuzzVal) |
151 |
- * dist_mat(object_nr, center_nr); |
|
151 |
+ |
|
152 |
+ * dist_mat(object_nr, center_nr); |
|
152 | 153 |
} |
153 | 154 |
} |
154 | 155 |
return(sum); |
... | ... |
@@ -215,13 +216,16 @@ static void |
215 | 216 |
{ |
216 | 217 |
double currFuzz = fuzz[0]; |
217 | 218 |
double k = weight_missing; |
219 |
+ |
|
220 |
+ NumericVector numerator(nr_features); |
|
221 |
+ double sum = 0; |
|
218 | 222 |
|
219 | 223 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
220 | 224 |
for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
221 | 225 |
// reset centers |
222 |
- centers(center_nr, feature_nr) = 0; |
|
226 |
+ std::fill(numerator.begin(), numerator.end(), 0); |
|
227 |
+ sum = 0; |
|
223 | 228 |
} |
224 |
- double sum = 0; |
|
225 | 229 |
for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
226 | 230 |
currFuzz = fuzz[object_nr]; |
227 | 231 |
double v = weight[object_nr]* pow(membership_mat(object_nr, center_nr), currFuzz) * (1 - (1 - ratio_missing_vals(object_nr)) * k); |
... | ... |
@@ -231,7 +235,7 @@ static void |
231 | 235 |
// only allow calculation if current value is not missing |
232 | 236 |
if(!missing_vals(object_nr, feature_nr)) { |
233 | 237 |
// numerator: sum(denominator * feature values) |
234 |
- centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr); |
|
238 |
+ numerator[feature_nr] += v * feature_mat(object_nr, feature_nr); |
|
235 | 239 |
} |
236 | 240 |
} |
237 | 241 |
|
... | ... |
@@ -239,7 +243,7 @@ static void |
239 | 243 |
sum = 1.0/sum; |
240 | 244 |
for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
241 | 245 |
// new center: numerator / denominator |
242 |
- centers(center_nr, feature_nr) *= sum; |
|
246 |
+ centers(center_nr, feature_nr) = numerator[feature_nr] * sum; |
|
243 | 247 |
} |
244 | 248 |
} |
245 | 249 |
} |
... | ... |
@@ -76,6 +76,7 @@ static void |
76 | 76 |
} |
77 | 77 |
} |
78 | 78 |
|
79 |
+ |
|
79 | 80 |
static void |
80 | 81 |
object_memberships(const NumericMatrix & dist_mat, |
81 | 82 |
const int nr_centers, const NumericVector & fuzz, |
... | ... |
@@ -84,10 +85,14 @@ static void |
84 | 85 |
double sum, v; |
85 | 86 |
|
86 | 87 |
int n_of_zeroes = 0; |
88 |
+ |
|
89 |
+ // Count the number of zeroes in the dist_mat for the current object |
|
87 | 90 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
88 | 91 |
if(dist_mat(object_nr, center_nr) == 0) |
89 | 92 |
n_of_zeroes++; |
90 | 93 |
} |
94 |
+ |
|
95 |
+ // If there are zeroes in the dist_mat |
|
91 | 96 |
if(n_of_zeroes > 0) { |
92 | 97 |
v = 1 / n_of_zeroes; |
93 | 98 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
... | ... |
@@ -96,9 +101,8 @@ static void |
96 | 101 |
dist_mat(object_nr, center_nr) == 0 ? v: 0; |
97 | 102 |
} |
98 | 103 |
} |
104 |
+ // If there are no zeroes in the dist_mat |
|
99 | 105 |
else { |
100 |
- /* Use the assumption that in general, pow() is more |
|
101 |
- expensive than subscripting. */ |
|
102 | 106 |
sum = 0; |
103 | 107 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
104 | 108 |
// check if individual fuzzifier values |
... | ... |
@@ -111,10 +115,11 @@ static void |
111 | 115 |
membership_mat(object_nr, center_nr) *= sum; |
112 | 116 |
} |
113 | 117 |
|
114 |
- sum = 0; |
|
118 |
+ /* sum = 0; |
|
115 | 119 |
double epsilon = 0.00001; |
116 | 120 |
for(int center_nr = 0; center_nr < nr_centers; center_nr++) |
117 | 121 |
sum += membership_mat(object_nr, center_nr); |
122 |
+ */ |
|
118 | 123 |
} |
119 | 124 |
|
120 | 125 |
|
... | ... |
@@ -326,7 +326,10 @@ double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & cente |
326 | 326 |
NumericVector & weight, NumericVector & fuzz, int dist_metric, int iter_max, double rel_tol, |
327 | 327 |
int verbose, NumericMatrix & membership_mat, double ermin, IntegerVector & iter, double missing_value = NA_REAL, |
328 | 328 |
double weight_missing = 0) { |
329 |
- |
|
329 |
+ |
|
330 |
+ // check for user interrupts |
|
331 |
+ Rcpp::checkUserInterrupt(); |
|
332 |
+ |
|
330 | 333 |
int nr_objects = feature_mat.nrow(); |
331 | 334 |
int nr_centers = centers.nrow(); |
332 | 335 |
// for symmetric matrix |
... | ... |
@@ -383,4 +386,4 @@ double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & cente |
383 | 386 |
|
384 | 387 |
ermin = new_fitness; |
385 | 388 |
return ermin; |
386 |
-} |
|
387 | 389 |
\ No newline at end of file |
390 |
+} |
... | ... |
@@ -348,7 +348,6 @@ double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & cente |
348 | 348 |
cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat); |
349 | 349 |
|
350 | 350 |
// calculate fitness: J(c, m) -> see literature |
351 |
- |
|
352 | 351 |
old_fitness = new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight, |
353 | 352 |
nr_objects, nr_centers, fuzz); |
354 | 353 |
|
... | ... |
@@ -226,16 +226,15 @@ static void |
226 | 226 |
// only allow calculation if current value is not missing |
227 | 227 |
if(!missing_vals(object_nr, feature_nr)) { |
228 | 228 |
// numerator: sum(denominator * feature values) |
229 |
- // missing_values -> numerator * (1-missing value ratio) |
|
230 | 229 |
centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr); |
231 |
- //centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr); |
|
232 | 230 |
} |
233 | 231 |
} |
234 | 232 |
|
235 | 233 |
} |
234 |
+ sum = 1.0/sum; |
|
236 | 235 |
for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
237 | 236 |
// new center: numerator / denominator |
238 |
- centers(center_nr, feature_nr) /= sum; |
|
237 |
+ centers(center_nr, feature_nr) *= sum; |
|
239 | 238 |
} |
240 | 239 |
} |
241 | 240 |
} |
... | ... |
@@ -335,12 +334,10 @@ double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & cente |
335 | 334 |
|
336 | 335 |
LogicalMatrix missing_vals(nr_objects, nr_features); |
337 | 336 |
NumericVector ratio_missing_vals(nr_objects); |
338 |
- |
|
339 | 337 |
fill_missing_vals_and_ratio(feature_mat, missing_vals, ratio_missing_vals, missing_value); |
340 | 338 |
|
341 | 339 |
double old_fitness, new_fitness; |
342 | 340 |
|
343 |
- |
|
344 | 341 |
NumericMatrix dist_mat = cmeans_setup(nr_objects, nr_centers, dist_metric); |
345 | 342 |
|
346 | 343 |
// calculate distances based on dist_metric(euclidean/manhattan) |
... | ... |
@@ -351,6 +348,7 @@ double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & cente |
351 | 348 |
cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat); |
352 | 349 |
|
353 | 350 |
// calculate fitness: J(c, m) -> see literature |
351 |
+ |
|
354 | 352 |
old_fitness = new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight, |
355 | 353 |
nr_objects, nr_centers, fuzz); |
356 | 354 |
|
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,389 @@ |
1 |
+#include <algorithm> |
|
2 |
+#include <Rcpp.h> |
|
3 |
+#include <iostream> |
|
4 |
+#include <fstream> |
|
5 |
+using namespace Rcpp; |
|
6 |
+using namespace std; |
|
7 |
+ |
|
8 |
+// needed for weighted medians |
|
9 |
+static NumericVector dwrk, dwrk_x, dwrk_w; |
|
10 |
+static IntegerVector iwrk; |
|
11 |
+ |
|
12 |
+// for debugging purposes |
|
13 |
+void printMat(const NumericMatrix &mat) { |
|
14 |
+ for(int i = 0; i < mat.nrow(); i++) { |
|
15 |
+ for(int j = 0; j < mat(i, _).size(); j++) { |
|
16 |
+ Rcout << mat(i, j) << ", "; |
|
17 |
+ } |
|
18 |
+ Rcout << '\n'; |
|
19 |
+ } |
|
20 |
+} |
|
21 |
+ |
|
22 |
+static NumericMatrix |
|
23 |
+ cmeans_setup(int nr_objects, int nr_centers, int dist_metric) |
|
24 |
+ { |
|
25 |
+ if(dist_metric == 1) { |
|
26 |
+ // Needed for weighted medians. (manhattan) |
|
27 |
+ dwrk_x = NumericVector(nr_objects); |
|
28 |
+ dwrk_w = NumericVector(nr_objects); |
|
29 |
+ dwrk = NumericVector(nr_objects); |
|
30 |
+ iwrk = IntegerVector(nr_objects); |
|
31 |
+ } |
|
32 |
+ return NumericMatrix(nr_objects, nr_centers); |
|
33 |
+ } |
|
34 |
+ |
|
35 |
+/* Update the dissimilarities/distances (between objects and prototypes) for a |
|
36 |
+ single object (i.e., a single row of the dissimilarity matrix. */ |
|
37 |
+static void |
|
38 |
+ object_dissimilarities(const NumericMatrix& feature_mat, const NumericMatrix& centers, |
|
39 |
+ const int nr_objects, const int nr_features, const int nr_centers, |
|
40 |
+ const int dist_metric, const int object_nr, NumericMatrix& dist_mat, |
|
41 |
+ const LogicalMatrix & missing_vals) |
|
42 |
+ { |
|
43 |
+ double sum, v; |
|
44 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
45 |
+ sum = 0; |
|
46 |
+ v = 0; |
|
47 |
+ for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
|
48 |
+ if(!missing_vals(object_nr, feature_nr)) { |
|
49 |
+ // feature_entry - center_entry |
|
50 |
+ v = feature_mat(object_nr, feature_nr) - centers(center_nr, feature_nr); |
|
51 |
+ } |
|
52 |
+ // euclidean-distance |
|
53 |
+ if(dist_metric == 0) |
|
54 |
+ sum += v * v; |
|
55 |
+ // manhattan-distance |
|
56 |
+ else if(dist_metric == 1) |
|
57 |
+ sum += fabs(v); |
|
58 |
+ |
|
59 |
+ } |
|
60 |
+ // save distance from object to center in distance matrix |
|
61 |
+ dist_mat(object_nr, center_nr) = sum; |
|
62 |
+ } |
|
63 |
+ } |
|
64 |
+ |
|
65 |
+static void |
|
66 |
+ cmeans_dissimilarities(const NumericMatrix& feature_mat, const NumericMatrix& centers, |
|
67 |
+ const int nr_objects, const int nr_features, |
|
68 |
+ const int nr_centers, const int dist_metric, |
|
69 |
+ NumericMatrix& dist_mat, const LogicalMatrix & missing_vals) |
|
70 |
+ { |
|
71 |
+ |
|
72 |
+ // loop over all objects |
|
73 |
+ for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
|
74 |
+ object_dissimilarities(feature_mat, centers, nr_objects, nr_features, |
|
75 |
+ nr_centers, dist_metric, object_nr, dist_mat, missing_vals); |
|
76 |
+ } |
|
77 |
+ } |
|
78 |
+ |
|
79 |
+static void |
|
80 |
+ object_memberships(const NumericMatrix & dist_mat, |
|
81 |
+ const int nr_centers, const NumericVector & fuzz, |
|
82 |
+ const int object_nr, NumericMatrix & membership_mat) |
|
83 |
+ { |
|
84 |
+ double sum, v; |
|
85 |
+ |
|
86 |
+ int n_of_zeroes = 0; |
|
87 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
88 |
+ if(dist_mat(object_nr, center_nr) == 0) |
|
89 |
+ n_of_zeroes++; |
|
90 |
+ } |
|
91 |
+ if(n_of_zeroes > 0) { |
|
92 |
+ v = 1 / n_of_zeroes; |
|
93 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
94 |
+ // update membership_mat only with v and on positions which are 0 in dist_mat |
|
95 |
+ membership_mat(object_nr, center_nr) = |
|
96 |
+ dist_mat(object_nr, center_nr) == 0 ? v: 0; |
|
97 |
+ } |
|
98 |
+ } |
|
99 |
+ else { |
|
100 |
+ /* Use the assumption that in general, pow() is more |
|
101 |
+ expensive than subscripting. */ |
|
102 |
+ sum = 0; |
|
103 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
104 |
+ // check if individual fuzzifier values |
|
105 |
+ v = pow(dist_mat(object_nr, center_nr), -1.0/(fuzz[object_nr]-1.0)); |
|
106 |
+ sum += v; |
|
107 |
+ membership_mat(object_nr, center_nr) = v; |
|
108 |
+ } |
|
109 |
+ sum = 1/sum; |
|
110 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) |
|
111 |
+ membership_mat(object_nr, center_nr) *= sum; |
|
112 |
+ } |
|
113 |
+ |
|
114 |
+ sum = 0; |
|
115 |
+ double epsilon = 0.00001; |
|
116 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) |
|
117 |
+ sum += membership_mat(object_nr, center_nr); |
|
118 |
+ } |
|
119 |
+ |
|
120 |
+ |
|
121 |
+static void |
|
122 |
+ cmeans_memberships(const NumericMatrix & dist_mat, |
|
123 |
+ const int nr_objects, const int nr_centers, |
|
124 |
+ const NumericVector & exponent, NumericMatrix & membership_mat) |
|
125 |
+ { |
|
126 |
+ // loop over all objects |
|
127 |
+ for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
|
128 |
+ object_memberships(dist_mat, nr_centers, exponent, |
|
129 |
+ object_nr, membership_mat); |
|
130 |
+ } |
|
131 |
+ } |
|
132 |
+ |
|
133 |
+static double |
|
134 |
+ cmeans_error_fn(const NumericMatrix & membership_mat, const NumericMatrix & dist_mat, |
|
135 |
+ NumericVector weight, const int nr_objects, |
|
136 |
+ const int nr_centers, const NumericVector & fuzz) |
|
137 |
+ { |
|
138 |
+ double sum; |
|
139 |
+ |
|
140 |
+ sum = 0; |
|
141 |
+ double currFuzzVal = fuzz[0]; |
|
142 |
+ for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
|
143 |
+ currFuzzVal = fuzz[object_nr]; |
|
144 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
145 |
+ sum += weight[object_nr] * pow(membership_mat(object_nr, center_nr), currFuzzVal) |
|
146 |
+ * dist_mat(object_nr, center_nr); |
|
147 |
+ } |
|
148 |
+ } |
|
149 |
+ return(sum); |
|
150 |
+ } |
|
151 |
+ |
|
152 |
+static void |
|
153 |
+ rsort_with_index(NumericVector & feature_values, IntegerVector & iwrk, |
|
154 |
+ const int len) |
|
155 |
+ { |
|
156 |
+ // using lambda function to sort index-vector based on values in feature-vector |
|
157 |
+ std::stable_sort(iwrk.begin(), iwrk.end(), [&](int i, int j) { |
|
158 |
+ return feature_values[i] < feature_values[j]; |
|
159 |
+ }); |
|
160 |
+ |
|
161 |
+ // sort feature vector in ascending order |
|
162 |
+ feature_values.sort(); |
|
163 |
+ } |
|
164 |
+ |
|
165 |
+static double |
|
166 |
+ cmeans_weighted_median(NumericVector & feature_values, |
|
167 |
+ NumericVector & weight, int len) |
|
168 |
+ { |
|
169 |
+ double val, mval, cumsum_w, cumsum_w_x, marg; |
|
170 |
+ |
|
171 |
+ /* Sort feature_values. */ |
|
172 |
+ for(int i = 0; i < len; i++) |
|
173 |
+ iwrk[i] = i; |
|
174 |
+ |
|
175 |
+ rsort_with_index(feature_values, iwrk, len); |
|
176 |
+ |
|
177 |
+ /* Permute weight using iwrk, and normalize. */ |
|
178 |
+ double sum = 0; |
|
179 |
+ for(int i = 0; i < len; i++) { |
|
180 |
+ dwrk[i] = weight[iwrk[i]]; |
|
181 |
+ sum += dwrk[i]; |
|
182 |
+ } |
|
183 |
+ for(int i = 0; i < len; i++) { |
|
184 |
+ weight[i] = dwrk[i] / sum; |
|
185 |
+ } |
|
186 |
+ |
|
187 |
+ cumsum_w = cumsum_w_x = 0; |
|
188 |
+ mval = R_PosInf; |
|
189 |
+ // first value is returned, if val never less than mval |
|
190 |
+ marg = feature_values[0]; |
|
191 |
+ |
|
192 |
+ for(int i = 0; i < len; i++) { |
|
193 |
+ cumsum_w += weight[i]; |
|
194 |
+ cumsum_w_x += weight[i] * feature_values[i]; |
|
195 |
+ val = feature_values[i] * (cumsum_w - .5) - cumsum_w_x; |
|
196 |
+ if(val < mval) { |
|
197 |
+ marg = feature_values[i]; |
|
198 |
+ mval = val; |
|
199 |
+ } |
|
200 |
+ } |
|
201 |
+ return(marg); |
|
202 |
+ } |
|
203 |
+ |
|
204 |
+static void |
|
205 |
+ euclidean_prototypes(const NumericMatrix & feature_mat, const NumericMatrix & membership_mat, |
|
206 |
+ const NumericVector & weight, const int nr_objects, |
|
207 |
+ const int nr_features, const int nr_centers, const NumericVector & fuzz, |
|
208 |
+ const int dist_metric, NumericMatrix & centers, const NumericVector & ratio_missing_vals, |
|
209 |
+ const LogicalMatrix & missing_vals, const double weight_missing) |
|
210 |
+ { |
|
211 |
+ double currFuzz = fuzz[0]; |
|
212 |
+ double k = weight_missing; |
|
213 |
+ |
|
214 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
215 |
+ for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
|
216 |
+ // reset centers |
|
217 |
+ centers(center_nr, feature_nr) = 0; |
|
218 |
+ } |
|
219 |
+ double sum = 0; |
|
220 |
+ for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
|
221 |
+ currFuzz = fuzz[object_nr]; |
|
222 |
+ double v = weight[object_nr]* pow(membership_mat(object_nr, center_nr), currFuzz) * (1 - (1 - ratio_missing_vals(object_nr)) * k); |
|
223 |
+ // denominator: sum of all membership values ^ fuzz * weight |
|
224 |
+ sum += v; |
|
225 |
+ for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
|
226 |
+ // only allow calculation if current value is not missing |
|
227 |
+ if(!missing_vals(object_nr, feature_nr)) { |
|
228 |
+ // numerator: sum(denominator * feature values) |
|
229 |
+ // missing_values -> numerator * (1-missing value ratio) |
|
230 |
+ centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr); |
|
231 |
+ //centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr); |
|
232 |
+ } |
|
233 |
+ } |
|
234 |
+ |
|
235 |
+ } |
|
236 |
+ for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
|
237 |
+ // new center: numerator / denominator |
|
238 |
+ centers(center_nr, feature_nr) /= sum; |
|
239 |
+ } |
|
240 |
+ } |
|
241 |
+ } |
|
242 |
+ |
|
243 |
+ |
|
244 |
+ |
|
245 |
+static void |
|
246 |
+ manhattan_prototypes(const NumericMatrix & feature_mat, const NumericMatrix & membership_mat, |
|
247 |
+ const NumericVector & weight, const int nr_objects, |
|
248 |
+ const int nr_features, const int nr_centers, const NumericVector & fuzz, |
|
249 |
+ const int dist_metric, NumericMatrix & centers) |
|
250 |
+ { |
|
251 |
+ double currFuzz = fuzz[0]; |
|
252 |
+ |
|
253 |
+ for(int center_nr = 0; center_nr < nr_centers; center_nr++) { |
|
254 |
+ for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) { |
|
255 |
+ for(int object_nr = 0; object_nr < nr_objects; object_nr++) { |
|
256 |
+ currFuzz = fuzz[object_nr]; |
|
257 |
+ dwrk_x[object_nr] = feature_mat(object_nr, feature_nr); |
|
258 |
+ dwrk_w[object_nr] = weight[object_nr] * |
|
259 |
+ pow(membership_mat(object_nr, center_nr), currFuzz); |
|
260 |
+ } |
|
261 |
+ centers(center_nr, feature_nr) = |
|
262 |
+ cmeans_weighted_median(dwrk_x, dwrk_w, nr_objects); |
|
263 |
+ } |
|
264 |
+ } |
|
265 |
+ } |
|
266 |
+ |
|
267 |
+ |
|
268 |
+static void |
|
269 |
+ cmeans_prototypes(const NumericMatrix & feature_mat, NumericMatrix & membership_mat, |
|
270 |
+ const NumericVector & weight, const int nr_objects, |
|
271 |
+ const int nr_features, const int nr_centers, const NumericVector & fuzz, |
|
272 |
+ const int dist_metric, NumericMatrix & centers, const NumericVector & ratio_missing_vals, |
|
273 |
+ const LogicalMatrix & missing_vals, const double weight_missing) |
|
274 |
+ { |
|
275 |
+ if(dist_metric == 0) { |
|
276 |
+ /* Euclidean: weighted means. */ |
|
277 |
+ euclidean_prototypes(feature_mat, membership_mat, weight, nr_objects, |
|
278 |
+ nr_features, nr_centers, fuzz, dist_metric, centers, ratio_missing_vals, missing_vals, weight_missing); |
|
279 |
+ } |
|
280 |
+ else { |
|
281 |
+ /* Manhattan: weighted medians. */ |
|
282 |
+ manhattan_prototypes(feature_mat, membership_mat, weight, nr_objects, |
|
283 |
+ nr_features, nr_centers, fuzz, dist_metric, centers); |
|
284 |
+ } |
|
285 |
+ } |
|
286 |
+ |
|
287 |
+// [[Rcpp::export]] |
|
288 |
+void fill_missing_vals_and_ratio(const NumericMatrix & feature_mat, LogicalMatrix & missing_vals, |
|
289 |
+ NumericVector & ratio_missing_vals, double missing_value = NA_REAL) { |
|
290 |
+ |
|
291 |
+ // declare function pointer for checking if value is missing value based on the parameter missing_value |
|
292 |
+ bool (*missing_val_check)(double val1, double val2); |
|
293 |
+ // assign comparator function to function pointer based on missing_value |
|
294 |
+ if(NumericVector::is_na(missing_value)) { |
|
295 |
+ missing_val_check = [](double val1, double val2){return NumericVector::is_na(val2);}; |
|
296 |
+ } else { |
|
297 |
+ missing_val_check = [](double val1, double val2){return val1 == val2;}; |
|
298 |
+ } |
|
299 |
+ |
|
300 |
+ double ratio = 0; |
|
301 |
+ double nr_missing_values = 0; |
|
302 |
+ |
|
303 |
+ for(int i = 0; i < feature_mat.nrow(); i++) { |
|
304 |
+ for(int j = 0; j < feature_mat.ncol(); j++) { |
|
305 |
+ if((*missing_val_check)(missing_value, feature_mat(i,j))) { |
|
306 |
+ missing_vals(i, j) = true; |
|
307 |
+ nr_missing_values++; |
|
308 |
+ } |
|
309 |
+ } |
|
310 |
+ // calculate ratio and save in vector |
|
311 |
+ ratio = nr_missing_values/feature_mat.ncol(); |
|
312 |
+ ratio_missing_vals(i) = ratio; |
|
313 |
+ |
|
314 |
+ // reset ratio and nr_missing_values for new object |
|
315 |
+ ratio = 0; |
|
316 |
+ nr_missing_values = 0; |
|
317 |
+ } |
|
318 |
+} |
|
319 |
+ |
|
320 |
+ |
|
321 |
+ |
|
322 |
+ |
|
323 |
+// attribute tells the compiler to make this function invisible in R |
|
324 |
+// returns ermin(fitness), because the conversion from R primitive datatype to C++ double doesn't allow changes directly |
|
325 |
+// [[Rcpp::export]] |
|
326 |
+double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & centers, |
|
327 |
+ NumericVector & weight, NumericVector & fuzz, int dist_metric, int iter_max, double rel_tol, |
|
328 |
+ int verbose, NumericMatrix & membership_mat, double ermin, IntegerVector & iter, double missing_value = NA_REAL, |
|
329 |
+ double weight_missing = 0) { |
|
330 |
+ |
|
331 |
+ int nr_objects = feature_mat.nrow(); |
|
332 |
+ int nr_centers = centers.nrow(); |
|
333 |
+ // for symmetric matrix |
|
334 |
+ int nr_features = feature_mat.ncol(); |
|
335 |
+ |
|
336 |
+ LogicalMatrix missing_vals(nr_objects, nr_features); |
|
337 |
+ NumericVector ratio_missing_vals(nr_objects); |
|
338 |
+ |
|
339 |
+ fill_missing_vals_and_ratio(feature_mat, missing_vals, ratio_missing_vals, missing_value); |
|
340 |
+ |
|
341 |
+ double old_fitness, new_fitness; |
|
342 |
+ |
|
343 |
+ |
|
344 |
+ NumericMatrix dist_mat = cmeans_setup(nr_objects, nr_centers, dist_metric); |
|
345 |
+ |
|
346 |
+ // calculate distances based on dist_metric(euclidean/manhattan) |
|
347 |
+ cmeans_dissimilarities(feature_mat, centers, nr_objects, nr_features, |
|
348 |
+ nr_centers, dist_metric, dist_mat, missing_vals); |
|
349 |
+ |
|
350 |
+ // calculate memberships based on distances -> save in membership_mat |
|
351 |
+ cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat); |
|
352 |
+ |
|
353 |
+ // calculate fitness: J(c, m) -> see literature |
|
354 |
+ old_fitness = new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight, |
|
355 |
+ nr_objects, nr_centers, fuzz); |
|
356 |
+ |
|
357 |
+ iter[0] = 0; |
|
358 |
+ |
|
359 |
+ // iterate iter_max-times to find cluster centers |
|
360 |
+ while(iter[0]++ < iter_max) { |
|
361 |
+ cmeans_prototypes(feature_mat, membership_mat, weight, nr_objects, nr_features, |
|
362 |
+ nr_centers, fuzz, dist_metric, centers, ratio_missing_vals, missing_vals, weight_missing); |
|
363 |
+ cmeans_dissimilarities(feature_mat, centers, nr_objects, nr_features, |
|
364 |
+ nr_centers, dist_metric, dist_mat, missing_vals); |
|
365 |
+ cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat); |
|
366 |
+ |
|
367 |
+ new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight, |
|
368 |
+ nr_objects, nr_centers, fuzz); |
|
369 |
+ |
|
370 |
+ if(fabs(old_fitness - new_fitness) < rel_tol * (old_fitness + rel_tol)) { |
|
371 |
+ if(verbose) |
|
372 |
+ Rcout << "Iteration: " << iter << " converged with fitness: " |
|
373 |
+ << new_fitness << "\n"; |
|
374 |
+ ermin = new_fitness; |
|
375 |
+ break; |
|
376 |
+ } |
|
377 |
+ else { |
|
378 |
+ if(verbose) { |
|
379 |
+ ermin = cmeans_error_fn(membership_mat, dist_mat, weight, nr_objects, |
|
380 |
+ nr_centers, fuzz); |
|
381 |
+ Rcout << "Iteration: " << iter << " with fitness: " << new_fitness << "\n"; |
|
382 |
+ } |
|
383 |
+ old_fitness = new_fitness; |
|
384 |
+ } |
|
385 |
+ } |
|
386 |
+ |
|
387 |
+ ermin = new_fitness; |
|
388 |
+ return ermin; |
|
389 |
+} |
|
0 | 390 |
\ No newline at end of file |