mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
Kmeans.cpp
Go to the documentation of this file.
1#include "util/Kmeans.hpp"
2
3#include <cfloat>
4#include <cmath>
5#include <limits>
7#include <random>
8
9
10using namespace m;
11using namespace Eigen;
12
13
14static constexpr unsigned KMEANS_MAX_ITERATIONS = 100;
15
16
17MatrixRXf m::kmeans_plus_plus(const MatrixXf &data, unsigned k)
18{
19 std::mt19937 g(0);
21 MatrixRXf centroids(k, data.cols());
24 VectorXf weights(data.rows());
25 weights.setConstant(std::numeric_limits<float>::max());
26
27 for (unsigned i = 0; i != k; ++i) {
28 /*----- Pick next centroid. -----*/
29 std::discrete_distribution<unsigned> dist(weights.data(), weights.data() + weights.size());
30 centroids.row(i) = data.row(dist(g));
31 /*----- Update distances. -----*/
32 weights = weights.cwiseMin((data.rowwise() - centroids.row(i)).rowwise().squaredNorm()); // cell-wise minimum
33 }
34
35 return centroids;
36}
37
38std::pair<std::vector<unsigned>, MatrixRXf> m::kmeans_with_centroids(const MatrixXf &data, unsigned k)
39{
40 M_insist(k >= 1, "kmeans requires at least one cluster");
41 if (data.size() == 0) return std::make_pair(std::vector<unsigned>(), MatrixXf(0, data.cols()));
42
43 /* Compute initial centroids via k-means++. */
44 MatrixRXf centroids = kmeans_plus_plus(data, k);
45
46 std::vector<unsigned> labels(data.rows(), 0); // the labels assigned to the data points
47 std::vector<unsigned> label_counters(k, 0); // the frequency of each label, used for iterative mean
48 bool change = true; // whether the assignment of labels changed
49
50 unsigned i = KMEANS_MAX_ITERATIONS;
51 while (change and i--) {
52 change = false;
53
54 /*----- Assignment step: Compute nearest centroid for all data points. ---------------------------------------*/
55 for (unsigned row_id = 0; row_id != data.rows(); ++row_id) {
56 unsigned label;
57 auto deltas = (centroids.rowwise() - data.row(row_id)).rowwise().squaredNorm();
58 deltas.minCoeff(&label);
59 change = change or labels[row_id] != label; // label has changed
60 labels[row_id] = label;
61 }
62
63 /*----- Update step: Compute new centroids as the mean of data points in the cluster. ------------------------*/
64 label_counters.assign(k, 0); // reset frequencies
65 centroids = MatrixRXf::Zero(centroids.rows(), centroids.cols()); // reset centroids
66 for (unsigned row_id = 0; row_id != data.rows(); ++row_id) {
67 const auto l = labels[row_id];
68 centroids.row(l) += (data.row(row_id) - centroids.row(l)) / ++label_counters[l]; // iterative mean
69 }
70 }
71
72 return std::make_pair(std::move(labels), std::move(centroids));
73}
static constexpr unsigned KMEANS_MAX_ITERATIONS
Definition: Kmeans.cpp:14
#define M_insist(...)
Definition: macro.hpp:129
‍mutable namespace
Definition: Backend.hpp:10
std::pair< std::vector< unsigned >, MatrixRXf > M_EXPORT kmeans_with_centroids(const Eigen::MatrixXf &data, unsigned k)
Clusters the given data according to the k-means algorithm.
and
Definition: enum_ops.hpp:12
MatrixRXf M_EXPORT kmeans_plus_plus(const Eigen::MatrixXf &data, unsigned k)
Compute initial cluster centroids using k-means++ algorithm.
Eigen::Matrix< float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > MatrixRXf
Definition: Kmeans.hpp:10