// // KMeans.cpp // // Created by Marc Melikyan on 10/2/20. // #include "kmeans.h" #include "../lin_alg/lin_alg.h" #include "../utilities/utilities.h" #include #include #include MLPPKMeans::MLPPKMeans(std::vector> inputSet, int k, std::string init_type) : inputSet(inputSet), k(k), init_type(init_type) { if (init_type == "KMeans++") { kmeansppInitialization(k); } else { centroidInitialization(k); } } std::vector> MLPPKMeans::modelSetTest(std::vector> X) { MLPPLinAlg alg; std::vector> closestCentroids; for (int i = 0; i < inputSet.size(); i++) { std::vector closestCentroid = mu[0]; for (int j = 0; j < r[0].size(); j++) { bool isCentroidCloser = alg.euclideanDistance(X[i], mu[j]) < alg.euclideanDistance(X[i], closestCentroid); if (isCentroidCloser) { closestCentroid = mu[j]; } } closestCentroids.push_back(closestCentroid); } return closestCentroids; } std::vector MLPPKMeans::modelTest(std::vector x) { MLPPLinAlg alg; std::vector closestCentroid = mu[0]; for (int j = 0; j < mu.size(); j++) { if (alg.euclideanDistance(x, mu[j]) < alg.euclideanDistance(x, closestCentroid)) { closestCentroid = mu[j]; } } return closestCentroid; } void MLPPKMeans::train(int epoch_num, bool UI) { double cost_prev = 0; int epoch = 1; Evaluate(); while (true) { // STEPS OF THE ALGORITHM // 1. DETERMINE r_nk // 2. DETERMINE J // 3. DETERMINE mu_k // STOP IF CONVERGED, ELSE REPEAT cost_prev = Cost(); computeMu(); Evaluate(); // UI PORTION if (UI) { MLPPUtilities::CostInfo(epoch, cost_prev, Cost()); } epoch++; if (epoch > epoch_num) { break; } } } double MLPPKMeans::score() { return Cost(); } std::vector MLPPKMeans::silhouette_scores() { MLPPLinAlg alg; std::vector> closestCentroids = modelSetTest(inputSet); std::vector silhouette_scores; for (int i = 0; i < inputSet.size(); i++) { // COMPUTING a[i] double a = 0; for (int j = 0; j < inputSet.size(); j++) { if (i != j && r[i] == r[j]) { a += alg.euclideanDistance(inputSet[i], inputSet[j]); } } // NORMALIZE a[i] a /= closestCentroids[i].size() - 1; // COMPUTING b[i] double b = INT_MAX; for (int j = 0; j < mu.size(); j++) { if (closestCentroids[i] != mu[j]) { double sum = 0; for (int k = 0; k < inputSet.size(); k++) { sum += alg.euclideanDistance(inputSet[i], inputSet[k]); } // NORMALIZE b[i] double k_clusterSize = 0; for (int k = 0; k < closestCentroids.size(); k++) { if (closestCentroids[k] == mu[j]) { k_clusterSize++; } } if (sum / k_clusterSize < b) { b = sum / k_clusterSize; } } } silhouette_scores.push_back((b - a) / fmax(a, b)); // Or the expanded version: // if(a < b) { // silhouette_scores.push_back(1 - a/b); // } // else if(a == b){ // silhouette_scores.push_back(0); // } // else{ // silhouette_scores.push_back(b/a - 1); // } } return silhouette_scores; } // This simply computes r_nk void MLPPKMeans::Evaluate() { MLPPLinAlg alg; r.resize(inputSet.size()); for (int i = 0; i < r.size(); i++) { r[i].resize(k); } for (int i = 0; i < r.size(); i++) { std::vector closestCentroid = mu[0]; for (int j = 0; j < r[0].size(); j++) { bool isCentroidCloser = alg.euclideanDistance(inputSet[i], mu[j]) < alg.euclideanDistance(inputSet[i], closestCentroid); if (isCentroidCloser) { closestCentroid = mu[j]; } } for (int j = 0; j < r[0].size(); j++) { if (mu[j] == closestCentroid) { r[i][j] = 1; } else { r[i][j] = 0; } } } } // This simply computes or re-computes mu_k void MLPPKMeans::computeMu() { MLPPLinAlg alg; for (int i = 0; i < mu.size(); i++) { std::vector num; num.resize(r.size()); for (int i = 0; i < num.size(); i++) { num[i] = 0; } double den = 0; for (int j = 0; j < r.size(); j++) { num = alg.addition(num, alg.scalarMultiply(r[j][i], inputSet[j])); } for (int j = 0; j < r.size(); j++) { den += r[j][i]; } mu[i] = alg.scalarMultiply(double(1) / double(den), num); } } void MLPPKMeans::centroidInitialization(int k) { mu.resize(k); for (int i = 0; i < k; i++) { std::random_device rd; std::default_random_engine generator(rd()); std::uniform_int_distribution distribution(0, int(inputSet.size() - 1)); mu[i].resize(inputSet.size()); mu[i] = inputSet[distribution(generator)]; } } void MLPPKMeans::kmeansppInitialization(int k) { MLPPLinAlg alg; std::random_device rd; std::default_random_engine generator(rd()); std::uniform_int_distribution distribution(0, int(inputSet.size() - 1)); mu.push_back(inputSet[distribution(generator)]); for (int i = 0; i < k - 1; i++) { std::vector farthestCentroid; for (int j = 0; j < inputSet.size(); j++) { double max_dist = 0; /* SUM ALL THE SQUARED DISTANCES, CHOOSE THE ONE THAT'S FARTHEST AS TO SPREAD OUT THE CLUSTER CENTROIDS. */ double sum = 0; for (int k = 0; k < mu.size(); k++) { sum += alg.euclideanDistance(inputSet[j], mu[k]); } if (sum * sum > max_dist) { farthestCentroid = inputSet[j]; max_dist = sum * sum; } } mu.push_back(farthestCentroid); } } double MLPPKMeans::Cost() { MLPPLinAlg alg; double sum = 0; for (int i = 0; i < r.size(); i++) { for (int j = 0; j < r[0].size(); j++) { sum += r[i][j] * alg.norm_sq(alg.subtraction(inputSet[i], mu[j])); } } return sum; }