Bioconductor Code: vsclust

Raw Blame Patch Log History
#include <algorithm>
#include <Rcpp.h>
#include <iostream>
#include <fstream>
using namespace Rcpp;
using namespace std;

// needed for weighted medians
static NumericVector dwrk, dwrk_x, dwrk_w;
static IntegerVector iwrk;

// for debugging purposes
void printMat(const NumericMatrix &mat) {
  for(int i = 0; i < mat.nrow(); i++) {
    for(int j = 0; j < mat(i, _).size(); j++) {
      Rcout << mat(i, j) << ", ";
    }
    Rcout << '\n';
  }
}

static NumericMatrix
  cmeans_setup(int nr_objects, int nr_centers, int dist_metric)
  {
    if(dist_metric == 1) {
      // Needed for weighted medians. (manhattan)
      dwrk_x = NumericVector(nr_objects);
      dwrk_w = NumericVector(nr_objects);
      dwrk = NumericVector(nr_objects);
      iwrk = IntegerVector(nr_objects);
    }
    return NumericMatrix(nr_objects, nr_centers);
  }

/* Update the dissimilarities/distances (between objects and prototypes) for a
 single object (i.e., a single row of the dissimilarity matrix. */
static void
  object_dissimilarities(const NumericMatrix& feature_mat, const NumericMatrix& centers,
                         const int nr_objects, const int nr_features, const int nr_centers,
                         const int dist_metric, const int object_nr, NumericMatrix& dist_mat, 
                         const LogicalMatrix & missing_vals)
  {
    double sum, v;
    for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
      sum = 0;
      v = 0;
      for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) {
        if(!missing_vals(object_nr, feature_nr)) {
          // feature_entry - center_entry
          v = feature_mat(object_nr, feature_nr) - centers(center_nr, feature_nr);
        }
        // euclidean-distance
        if(dist_metric == 0)
          sum += v * v;
        // manhattan-distance
        else if(dist_metric == 1)
          sum += fabs(v);
        
      }
      // save distance from object to center in distance matrix
      dist_mat(object_nr, center_nr) = sum;
    }
  }

static void
  cmeans_dissimilarities(const NumericMatrix& feature_mat, const NumericMatrix& centers,
                         const int nr_objects, const int nr_features, 
                         const int nr_centers, const int dist_metric, 
                         NumericMatrix& dist_mat, const LogicalMatrix & missing_vals)
  {
    
    // loop over all objects
    for(int object_nr = 0; object_nr < nr_objects; object_nr++) {
      object_dissimilarities(feature_mat, centers, nr_objects, nr_features,
                             nr_centers, dist_metric, object_nr, dist_mat, missing_vals);
    }
  }

static void
  object_memberships(const NumericMatrix & dist_mat,
                     const int nr_centers, const NumericVector & fuzz,
                     const int object_nr, NumericMatrix & membership_mat)
  {
    double sum, v;
    
    int n_of_zeroes = 0;
    for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
      if(dist_mat(object_nr, center_nr) == 0)
        n_of_zeroes++;
    }
    if(n_of_zeroes > 0) {
      v = 1 / n_of_zeroes;
      for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
        // update membership_mat only with v and on positions which are 0 in dist_mat
        membership_mat(object_nr, center_nr) = 
          dist_mat(object_nr, center_nr) == 0 ? v: 0;
      }
    }
    else {
      /* Use the assumption that in general, pow() is more
       expensive than subscripting. */
      sum = 0;
      for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
        // check if individual fuzzifier values
        v = pow(dist_mat(object_nr, center_nr), -1.0/(fuzz[object_nr]-1.0));
        sum += v;
        membership_mat(object_nr, center_nr) = v;
      }
      sum = 1/sum;
      for(int center_nr = 0; center_nr < nr_centers; center_nr++)
        membership_mat(object_nr, center_nr) *= sum;
    }
    
    sum = 0;
    double epsilon = 0.00001;
    for(int center_nr = 0; center_nr < nr_centers; center_nr++)
      sum +=  membership_mat(object_nr, center_nr);
  }


static void
  cmeans_memberships(const NumericMatrix & dist_mat,
                     const int nr_objects, const int nr_centers,
                     const NumericVector & exponent, NumericMatrix & membership_mat)
  {
    // loop over all objects
    for(int object_nr = 0; object_nr < nr_objects; object_nr++) {
      object_memberships(dist_mat, nr_centers, exponent,
                         object_nr, membership_mat);
    }
  }

static double
  cmeans_error_fn(const NumericMatrix & membership_mat, const NumericMatrix & dist_mat,
                  NumericVector weight, const int nr_objects, 
                  const int nr_centers, const NumericVector & fuzz)
  {
    double sum;
    
    sum = 0;
    double currFuzzVal = fuzz[0];
    for(int object_nr = 0; object_nr < nr_objects; object_nr++) {
      currFuzzVal = fuzz[object_nr];  
      for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
        sum += weight[object_nr] * pow(membership_mat(object_nr, center_nr), currFuzzVal)
        * dist_mat(object_nr, center_nr);
      }
    }
    return(sum);
  }

static void
  rsort_with_index(NumericVector & feature_values, IntegerVector & iwrk, 
                   const int len)
  {
    // using lambda function to sort index-vector based on values in feature-vector
    std::stable_sort(iwrk.begin(), iwrk.end(), [&](int i, int j) {
      return feature_values[i] < feature_values[j];
    });
    
    // sort feature vector in ascending order
    feature_values.sort();
  }

static double
  cmeans_weighted_median(NumericVector & feature_values, 
                         NumericVector &  weight, int len)
  {
    double val, mval, cumsum_w, cumsum_w_x, marg;
    
    /* Sort feature_values. */
    for(int i = 0; i < len; i++)
      iwrk[i] = i;
    
    rsort_with_index(feature_values, iwrk, len);
    
    /* Permute weight using iwrk, and normalize. */
    double sum = 0;    
    for(int i = 0; i < len; i++) {
      dwrk[i] = weight[iwrk[i]];
      sum += dwrk[i];
    }
    for(int i = 0; i < len; i++) {
      weight[i] = dwrk[i] / sum;
    }
    
    cumsum_w = cumsum_w_x = 0;
    mval = R_PosInf;
    // first value is returned, if val never less than mval
    marg = feature_values[0];
    
    for(int i = 0; i < len; i++) {
      cumsum_w += weight[i];
      cumsum_w_x += weight[i] * feature_values[i];
      val = feature_values[i] * (cumsum_w - .5) - cumsum_w_x;
      if(val < mval) {
        marg = feature_values[i];
        mval = val;
      }
    }
    return(marg);
  }

static void 
  euclidean_prototypes(const NumericMatrix & feature_mat, const NumericMatrix & membership_mat,
                       const NumericVector & weight, const int nr_objects,
                       const int nr_features, const int nr_centers, const NumericVector & fuzz,
                       const int dist_metric, NumericMatrix & centers, const NumericVector & ratio_missing_vals, 
                       const LogicalMatrix & missing_vals, const double weight_missing) 
  {
    double currFuzz = fuzz[0];
    double k = weight_missing;
    
    for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
      for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) {
        // reset centers
        centers(center_nr, feature_nr) = 0;
      }
      double sum = 0;
      for(int object_nr = 0; object_nr < nr_objects; object_nr++) {
        currFuzz = fuzz[object_nr]; 
        double v = weight[object_nr]* pow(membership_mat(object_nr, center_nr), currFuzz) * (1 - (1 - ratio_missing_vals(object_nr)) * k);
        // denominator: sum of all membership values ^ fuzz * weight
        sum += v;
        for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) {
          // only allow calculation if current value is not missing
          if(!missing_vals(object_nr, feature_nr)) {
            // numerator: sum(denominator * feature values)
            centers(center_nr, feature_nr) += v * feature_mat(object_nr, feature_nr);
          }
        }
        
      }
      sum = 1.0/sum;
      for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) {
        // new center: numerator / denominator
        centers(center_nr, feature_nr) *= sum;
      }
    }
  }



static void 
  manhattan_prototypes(const NumericMatrix & feature_mat, const NumericMatrix & membership_mat,
                       const NumericVector & weight, const int nr_objects,
                       const int nr_features, const int nr_centers, const NumericVector & fuzz,
                       const int dist_metric, NumericMatrix & centers)
  {
    double currFuzz = fuzz[0];
    
    for(int center_nr = 0; center_nr < nr_centers; center_nr++) {
      for(int feature_nr = 0; feature_nr < nr_features; feature_nr++) {
        for(int object_nr = 0; object_nr < nr_objects; object_nr++) {
          currFuzz = fuzz[object_nr]; 
          dwrk_x[object_nr] = feature_mat(object_nr, feature_nr);
          dwrk_w[object_nr] = weight[object_nr] * 
            pow(membership_mat(object_nr, center_nr), currFuzz);
        }
        centers(center_nr, feature_nr) =
          cmeans_weighted_median(dwrk_x, dwrk_w, nr_objects);
      }
    }
  }


static void
  cmeans_prototypes(const NumericMatrix & feature_mat, NumericMatrix & membership_mat,
                    const NumericVector & weight, const int nr_objects,
                    const int nr_features, const int nr_centers, const NumericVector & fuzz,
                    const int dist_metric, NumericMatrix & centers, const NumericVector & ratio_missing_vals,  
                    const LogicalMatrix & missing_vals, const double weight_missing)
  {
    if(dist_metric == 0) {
      /* Euclidean: weighted means. */
      euclidean_prototypes(feature_mat, membership_mat, weight, nr_objects,
                           nr_features, nr_centers, fuzz, dist_metric, centers, ratio_missing_vals, missing_vals, weight_missing);
    }
    else {
      /* Manhattan: weighted medians. */
      manhattan_prototypes(feature_mat, membership_mat, weight, nr_objects,
                           nr_features, nr_centers, fuzz, dist_metric, centers);
    }
  }

// [[Rcpp::export]]
void fill_missing_vals_and_ratio(const NumericMatrix & feature_mat, LogicalMatrix & missing_vals, 
                                 NumericVector & ratio_missing_vals, double missing_value = NA_REAL) {
  
  // declare function pointer for checking if value is missing value based on the parameter missing_value
  bool (*missing_val_check)(double val1, double val2);
  // assign comparator function to function pointer based on missing_value
  if(NumericVector::is_na(missing_value)) {
    missing_val_check = [](double val1, double val2){return NumericVector::is_na(val2);};
  } else {
    missing_val_check = [](double val1, double val2){return val1 == val2;};  
  }
  
  double ratio = 0;
  double nr_missing_values = 0;
  
  for(int i = 0; i < feature_mat.nrow(); i++) {
    for(int j = 0; j < feature_mat.ncol(); j++) {
      if((*missing_val_check)(missing_value, feature_mat(i,j))) {
        missing_vals(i, j) = true;
        nr_missing_values++;
      }
    }
    // calculate ratio and save in vector
    ratio = nr_missing_values/feature_mat.ncol();
    ratio_missing_vals(i) = ratio;
    
    // reset ratio and nr_missing_values for new object
    ratio = 0;
    nr_missing_values = 0;
  }
}




// attribute tells the compiler to make this function invisible in R
// returns ermin(fitness), because the conversion from R primitive datatype to C++ double doesn't allow changes directly
// [[Rcpp::export]]
double c_plusplus_means(const NumericMatrix & feature_mat, NumericMatrix & centers,
                        NumericVector & weight, NumericVector & fuzz, int dist_metric, int iter_max, double rel_tol,
                        int verbose, NumericMatrix & membership_mat, double ermin, IntegerVector & iter, double missing_value = NA_REAL, 
                        double weight_missing = 0) {

  // check for user interrupts
  Rcpp::checkUserInterrupt();

  int nr_objects = feature_mat.nrow();
  int nr_centers = centers.nrow();
  // for symmetric matrix
  int nr_features = feature_mat.ncol();
  
  LogicalMatrix missing_vals(nr_objects, nr_features);
  NumericVector ratio_missing_vals(nr_objects);
  fill_missing_vals_and_ratio(feature_mat, missing_vals, ratio_missing_vals, missing_value);

  double old_fitness, new_fitness;
  
  NumericMatrix dist_mat = cmeans_setup(nr_objects, nr_centers, dist_metric);
  
  // calculate distances based on dist_metric(euclidean/manhattan)
  cmeans_dissimilarities(feature_mat, centers, nr_objects, nr_features, 
                         nr_centers, dist_metric, dist_mat, missing_vals);
  
  // calculate memberships based on distances -> save in membership_mat
  cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat);
  
  // calculate fitness: J(c, m) -> see literature
  old_fitness = new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight,
                                              nr_objects, nr_centers, fuzz);
  
  iter[0] = 0;
  
  // iterate iter_max-times to find cluster centers
  while(iter[0]++ < iter_max) {
    cmeans_prototypes(feature_mat, membership_mat, weight, nr_objects, nr_features,
                      nr_centers, fuzz, dist_metric, centers, ratio_missing_vals, missing_vals, weight_missing);
    cmeans_dissimilarities(feature_mat, centers, nr_objects, nr_features, 
                           nr_centers, dist_metric, dist_mat, missing_vals);
    cmeans_memberships(dist_mat, nr_objects, nr_centers, fuzz, membership_mat);

    new_fitness = cmeans_error_fn(membership_mat, dist_mat, weight, 
                                  nr_objects, nr_centers, fuzz);

    if(fabs(old_fitness - new_fitness) < rel_tol * (old_fitness + rel_tol)) {
      if(verbose)
        Rcout << "Iteration: " << iter << " converged with fitness: " 
              << new_fitness << "\n";
        ermin = new_fitness;
        break;
    }
    else {
      if(verbose) {
        ermin = cmeans_error_fn(membership_mat, dist_mat, weight, nr_objects,
                                nr_centers, fuzz);
        Rcout << "Iteration: " << iter << " with fitness: " << new_fitness << "\n";
      }
      old_fitness = new_fitness;
    }
  }
  
  ermin = new_fitness;
  return ermin;
}