/** Minimum number of rows a cluster must exceed before it is split further while learning the SPN;
 * set in `Spn::learn_spn()` relative to the table size (see its body: 10% of rows, at least 1). */
std::size_t MIN_INSTANCE_SLICE = 0;

/** Threshold on the randomized dependence coefficient (RDC) above which two columns are considered
 * dependent and therefore kept in the same vertical partition by `rdc_split()`. */
const float RDC_THRESHOLD = 0.3f;
20MatrixXf normalize_minmax(
const MatrixXf &data)
22 const RowVectorXf mins = data.colwise().minCoeff();
23 const RowVectorXf maxs = data.colwise().maxCoeff() - mins;
25 MatrixXf normalized(data.rows(), data.cols());
26 for (
unsigned i = 0; i != data.cols(); ++i) {
27 if (maxs.array()[i] == 0)
28 normalized.col(i) = VectorXf::Zero(data.rows());
30 normalized.col(i) = (data.col(i).array() - mins.array()[i]) / maxs.array()[i];
41std::pair<std::vector<SmallBitset>, std::vector<SmallBitset>>rdc_split(
const MatrixXf &data,
SmallBitset variables)
43 const auto num_cols = data.cols();
45 std::vector<MatrixXf> CDF_matrices(num_cols);
48 for (
int i = 0; i < num_cols; i++) { CDF_matrices[i] =
create_CDF_matrix(data.col(i)); }
51 for (
unsigned i = 0; i < num_cols - 1; i++) {
52 for (
unsigned j = i+1; j < num_cols; j++) {
55 if (rdc_value >= RDC_THRESHOLD) {
56 adjacency_matrix(i,j) =
true;
57 adjacency_matrix(j,i) =
true;
63 std::vector<SmallBitset> connected_subgraphs;
67 auto next = remaining.
begin();
68 const SmallBitset CSG = adjacency_matrix.reachable(next.as_set());
69 connected_subgraphs.emplace_back(CSG);
73 std::vector<SmallBitset> variable_candidates(connected_subgraphs.size());
75 std::vector<std::size_t> temp_variables;
76 temp_variables.reserve(num_cols);
77 for (
auto it = variables.
begin(); it != variables.
end(); ++it) { temp_variables.emplace_back(*it); }
80 for (std::size_t i = 0; i < connected_subgraphs.size(); ++i) {
81 for (
auto it = connected_subgraphs[i].begin(); it != connected_subgraphs[i].end(); ++it) {
82 variable_candidates[i][temp_variables[*it]] =
true;
86 return std::make_pair(connected_subgraphs, variable_candidates);
102 float expectation_result = 0.f;
103 float likelihood_result = 0.f;
104 for (
auto &child : children) {
107 likelihood_result += child->weight *
likelihood;
110 return { expectation_result, likelihood_result };
116 unsigned nearest_centroid = 0;
117 std::size_t num_clusters = children.size();
118 float delta = (children[0]->centroid - row).squaredNorm();
119 for (std::size_t i = 1; i < num_clusters; i++) {
120 float next_delta = (children[i]->centroid - row).squaredNorm();
121 if (next_delta < delta) {
123 nearest_centroid = i;
128 children[nearest_centroid]->child->num_rows++;
131 for (std::size_t i = 0; i < num_clusters; i++) {
132 children[i]->weight = children[i]->child->num_rows / float(
num_rows);
135 children[nearest_centroid]->child->update(row, variables, update_type);
140 std::size_t result = 0;
141 for (
auto &child : children) {
142 result += child->child->estimate_number_distinct_values(
id);
150 for (std::size_t i = 0; i < children.size(); i++) {
151 for (std::size_t n = 0; n < num_tabs; n++) { out <<
"\t"; }
152 if (i != 0) { out <<
"+ "; }
153 out << children[i]->weight <<
"\n";
154 children[i]->child->print(out, num_tabs+1);
165 float expectation_result = 1.f;
166 float likelihood_result = 1.f;
167 for (
auto &child : children) {
168 for (
const auto & it : filter) {
169 if (child->variables[it.first]) {
177 return {expectation_result, likelihood_result };
182 std::unordered_map<unsigned, unsigned> variable_to_index;
184 for (
auto it = variables.
begin(); it != variables.
end(); ++it) { variable_to_index.emplace(*it, index++); }
187 for (
auto &child : children) {
188 std::size_t num_cols = child->variables.size();
189 VectorXf proj_row(num_cols);
190 auto it = child->variables.begin();
191 for (std::size_t i = 0; i < num_cols; ++i) {
192 proj_row(i) = row(variable_to_index[*it]);
195 child->child->update(proj_row, child->variables, update_type);
201 for (
auto &child : children) {
202 if (child->variables[
id]) {
203 return child->child->estimate_number_distinct_values(
id);
212 for (std::size_t i = 0; i < children.size(); i++) {
213 for (std::size_t n = 0; n < num_tabs; n++) { out <<
"\t"; }
214 if (i != 0) { out <<
"* "; }
215 out <<
"variable scope=(";
216 for (
auto it = children[i]->variables.
begin(); it != children[i]->variables.end(); ++it) { out << *it; }
218 children[i]->child->print(out, num_tabs+1);
229 auto [spn_operator, value] = filter.at(leaf_id);
231 if (spn_operator ==
IS_NULL) {
return { null_probability, null_probability }; }
233 if (bins.empty()) {
return { 0.f, 0.f }; }
236 float expectation = bins[0].cumulative_probability * bins[0].value;
237 float prev_probability = bins[0].cumulative_probability;
238 for (std::size_t bin_id = 1; bin_id < bins.size(); bin_id++) {
239 float cumulative_probability = bins[bin_id].cumulative_probability;
240 float probability = cumulative_probability - prev_probability;
242 prev_probability = cumulative_probability;
247 float last_prob = std::prev(bins.end())->cumulative_probability;
248 float probability = 0.f;
251 if (bins.begin()->value > value or (std::prev(bins.end()))->value < value) {
return { 0.f, 0.f }; }
252 auto lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
257 return { probability, probability };
261 if (bins.begin()->value >= value) {
return { 0.f, 0.f }; }
262 if ((std::prev(bins.end()))->value < value) {
return { last_prob, last_prob }; }
263 auto lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
264 probability = (--
lower_bound)->cumulative_probability;
265 return { probability, probability };
269 if (bins.begin()->value > value) {
return { 0.f, 0.f }; }
270 if ((std::prev(bins.end()))->value <= value) {
return { last_prob, last_prob }; }
271 auto upper_bound = std::upper_bound(bins.begin(), bins.end(), value,
274 probability = (--
upper_bound)->cumulative_probability;
275 return { probability, probability };
279 if (bins.begin()->value > value) {
return { last_prob, last_prob }; }
280 if ((std::prev(bins.end()))->value <= value) {
return { 0.f, 0.f }; }
281 auto upper_bound = std::upper_bound(bins.begin(), bins.end(), value,
284 probability = last_prob - (--
upper_bound)->cumulative_probability;
285 return { probability, probability };
289 if (bins.begin()->value >= value) {
return { last_prob, last_prob }; }
290 if ((std::prev(bins.end()))->value < value) {
return { 0.f, 0.f }; }
291 auto lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
292 probability = last_prob - (--
lower_bound)->cumulative_probability;
293 return { probability, probability };
301 const float value = row(0);
304 if (update_type ==
INSERT) {
305 bins.emplace_back(value, 1.f/(
num_rows + 1));
313 std::vector<Bin> updated_bins;
314 updated_bins.reserve(bins.size());
315 updated_bins.emplace_back(bins[0].value, bins[0].cumulative_probability *
num_rows);
316 for (std::size_t i = 1; i < bins.size(); i++) {
317 updated_bins.emplace_back(
319 (bins[i].cumulative_probability - bins[i-1].cumulative_probability) *
num_rows
324 if (update_type ==
INSERT) {
325 auto lower_bound = std::lower_bound(updated_bins.begin(), updated_bins.end(), value);
326 if (
lower_bound == updated_bins.end()) { updated_bins.emplace_back(value, 1.f); }
333 auto lower_bound = std::lower_bound(updated_bins.begin(), updated_bins.end(), value);
340 updated_bins[0].cumulative_probability /= float(
num_rows);
341 for (std::size_t i = 1; i < updated_bins.size(); i++) {
342 updated_bins[i].cumulative_probability /= float(
num_rows);
343 updated_bins[i].cumulative_probability += updated_bins[i-1].cumulative_probability;
346 bins = std::move(updated_bins);
356 for (std::size_t n = 0; n < num_tabs; n++) { out <<
"\t"; }
358 if (not bins.empty()) {
359 out << bins[0].value <<
":" << bins[0].cumulative_probability;
360 for (std::size_t i = 1; i < bins.size(); i++) {
361 out <<
", " << bins[i].value <<
":" << bins[i].cumulative_probability - bins[i-1].cumulative_probability;
365 out <<
"null_prob:" << null_probability;
377 auto [spn_operator, value] = filter.at(leaf_id);
378 float probability = 0.f;
379 if (spn_operator ==
IS_NULL) {
return { null_probability, null_probability }; }
380 if (bins.empty()) {
return { 0.f, 0.f }; }
385 float prev_probability = bins[0].cumulative_probability;
386 float prev_upper_bound = bins[0].upper_bound;
387 for (std::size_t bin_id = 1; bin_id < bins.size(); bin_id++) {
389 float cumulative_probability = bins[bin_id].cumulative_probability;
390 float current_probability = cumulative_probability - prev_probability;
392 prev_probability = cumulative_probability;
399 if (
lower_bound > value or (std::prev(bins.end()))->upper_bound < value) {
return { 0.f, 0.f }; }
400 if (
lower_bound == value) {
return { lower_bound_probability, lower_bound_probability }; }
402 probability = bins[0].cumulative_probability - lower_bound_probability;
403 return { probability, probability };
405 auto std_lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
406 probability = std_lower_bound->cumulative_probability - (--std_lower_bound)->cumulative_probability;
407 return { probability, probability };
413 return { lower_bound_probability, lower_bound_probability };
417 if ((std::prev(bins.end()))->upper_bound <= value) {
418 probability = std::prev(bins.end())->cumulative_probability;
419 return { probability, probability };
422 float prob = bins[0].cumulative_probability - lower_bound_probability;
423 probability = lower_bound_probability +
425 return { probability, probability };
427 auto std_lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
428 auto &bin = *std_lower_bound;
429 auto &prev_bin = *(--std_lower_bound);
430 float bin_probability = bin.cumulative_probability - prev_bin.cumulative_probability;
431 float percentile = (value - prev_bin.upper_bound) / (bin.upper_bound - prev_bin.upper_bound);
434 probability = prev_bin.cumulative_probability + (bin_probability * percentile);
436 probability = bin.cumulative_probability;
438 probability = prev_bin.cumulative_probability;
440 return { probability, probability };
444 float last_prob = std::prev(bins.end())->cumulative_probability;
447 probability = last_prob - lower_bound_probability;
448 return { probability, probability };
450 return { last_prob, last_prob };
452 if ((std::prev(bins.end()))->upper_bound <= value) {
return { 0.f, 0.f }; }
454 float prob = bins[0].cumulative_probability - lower_bound_probability;
456 probability = last_prob - bins[0].cumulative_probability + split_bin_prob;
457 return { probability, probability };
459 auto std_lower_bound = std::lower_bound(bins.begin(), bins.end(), value);
460 auto &bin = *std_lower_bound;
461 auto &prev_bin = *(--std_lower_bound);
462 float bin_probability = bin.cumulative_probability - prev_bin.cumulative_probability;
463 float percentile = 1.f - ((value - prev_bin.upper_bound) / (bin.upper_bound - prev_bin.upper_bound));
466 probability = last_prob - bin.cumulative_probability + (bin_probability * percentile);
468 probability = last_prob - prev_bin.cumulative_probability;
470 probability = last_prob - bin.cumulative_probability;
472 return { probability, probability };
480 const float value = row(0);
483 if (update_type ==
INSERT) {
484 bins.emplace_back(value, 1.f/(
num_rows + 1));
492 std::vector<Bin> updated_bins;
493 float updated_lower_bound_prob = lower_bound_probability *
num_rows;
494 updated_bins.reserve(bins.size());
495 updated_bins.emplace_back(
497 (bins[0].cumulative_probability - lower_bound_probability) *
num_rows
499 for (std::size_t i = 1; i < bins.size(); i++) {
500 updated_bins.emplace_back(
502 (bins[i].cumulative_probability - bins[i-1].cumulative_probability) *
num_rows
507 if (update_type ==
INSERT) {
509 updated_lower_bound_prob += 1;
511 updated_bins[0].cumulative_probability += updated_lower_bound_prob;
513 updated_lower_bound_prob = 1.f;
514 }
else if (value > updated_bins[updated_bins.size() - 1].upper_bound) {
515 updated_bins.emplace_back(value, 1.f);
517 auto std_lower_bound = std::lower_bound(updated_bins.begin(), updated_bins.end(), value);
518 std_lower_bound->cumulative_probability += 1;
524 if (value < lower_bound or value > updated_bins[updated_bins.size() - 1].upper_bound) {
return; }
526 updated_lower_bound_prob += 1;
528 auto std_lower_bound = std::lower_bound(updated_bins.begin(), updated_bins.end(), value);
529 if (std_lower_bound->cumulative_probability == 0) {
return; }
530 std_lower_bound->cumulative_probability -= 1;
536 updated_lower_bound_prob /= float(
num_rows);
537 updated_bins[0].cumulative_probability /= float(
num_rows);
538 updated_bins[0].cumulative_probability += updated_lower_bound_prob;
539 for (std::size_t i = 1; i < updated_bins.size(); i++) {
540 updated_bins[i].cumulative_probability /= float(
num_rows);
541 updated_bins[i].cumulative_probability += updated_bins[i-1].cumulative_probability;
543 lower_bound_probability = updated_lower_bound_prob;
544 bins = std::move(updated_bins);
554 for (std::size_t n = 0; n < num_tabs; n++) { out <<
"\t"; }
556 if (not bins.empty()) {
557 out << lower_bound_probability <<
": ";
559 out <<
" :" << bins[0].cumulative_probability - lower_bound_probability <<
": " << bins[0].upper_bound;
560 for (std::size_t i = 1; i < bins.size(); i++) {
561 out <<
" :" << bins[i].cumulative_probability - bins[i-1].cumulative_probability;
562 out <<
": " << bins[i].upper_bound;
566 out <<
"null_prob:" << null_probability;
579 std::vector<std::unique_ptr<Product::ChildWithVariables>> children;
581 for (
auto i = 0; i < ld.
data.cols(); i++) {
582 const MatrixXf &data = ld.
data.col(i);
583 const MatrixXf &normalized = ld.
normalized.col(i);
584 const MatrixXi &null_matrix = ld.
null_matrix.col(i);
587 std::vector<LeafType> split_leaf_types{ld.
leaf_types[i]};
595 auto child = std::make_unique<Product::ChildWithVariables>(
learn_node(split_data), variables);
596 children.push_back(std::move(child));
598 return std::make_unique<Product>(std::move(children), ld.
data.rows());
604 std::vector<SmallBitset> &column_candidates,
605 std::vector<SmallBitset> &variable_candidates
608 std::vector<std::unique_ptr<Product::ChildWithVariables>> children;
609 for (std::size_t current_split = 0; current_split < column_candidates.size(); current_split++) {
610 std::size_t split_size = column_candidates[current_split].size();
611 std::vector<LeafType> split_leaf_types;
612 split_leaf_types.reserve(split_size);
613 std::vector<unsigned> column_index;
614 column_index.reserve(split_size);
615 for (
auto it = column_candidates[current_split].begin(); it != column_candidates[current_split].end(); ++it) {
616 split_leaf_types.push_back(ld.
leaf_types[*it]);
617 column_index.push_back(*it);
620 const MatrixXf &data = ld.
data(all, column_index);
621 const MatrixXf &normalized = ld.
normalized(all, column_index);
622 const MatrixXi &null_matrix = ld.
null_matrix(all, column_index);
623 LearningData split_data(data, normalized, null_matrix, variable_candidates[current_split], split_leaf_types);
624 auto child = std::make_unique<Product::ChildWithVariables>(
626 variable_candidates[current_split]
628 children.push_back(std::move(child));
630 return std::make_unique<Product>(std::move(children), ld.
data.rows());
639 unsigned prev_num_split_nodes = 0;
640 std::vector<std::vector<SmallBitset>> prev_cluster_column_candidates;
641 std::vector<std::vector<SmallBitset>> prev_cluster_variable_candidates;
642 std::vector<std::vector<unsigned>> prev_cluster_row_ids;
647 unsigned num_split_nodes = 0;
651 std::vector<std::vector<SmallBitset>> cluster_column_candidates(k);
652 std::vector<std::vector<SmallBitset>> cluster_variable_candidates(k);
653 std::vector<std::vector<unsigned>> cluster_row_ids(k);
655 for (
unsigned current_row = 0; current_row <
num_rows; current_row++) {
656 cluster_row_ids[labels[current_row]].push_back(current_row);
660 bool only_one_cluster = not cluster_row_ids[0].empty();
661 for (
int i = 1; i < k; i++)
662 only_one_cluster = only_one_cluster
and cluster_row_ids[i].empty();
664 if (only_one_cluster) {
665 const std::size_t subvector_size = cluster_row_ids[0].size() / k;
666 std::vector<std::vector<unsigned>> new_cluster_row_ids(k);
667 for (std::size_t cluster_id = 0; cluster_id < k - 1; cluster_id++) {
668 std::vector<unsigned> subvector(
669 cluster_row_ids[0].begin() + (cluster_id * subvector_size),
670 cluster_row_ids[0].begin() + ((cluster_id + 1) * subvector_size)
672 new_cluster_row_ids[cluster_id] = std::move(subvector);
674 std::vector<unsigned> last_subvector(
675 cluster_row_ids[0].begin() + ((k - 1) * subvector_size),
676 cluster_row_ids[0].end()
678 new_cluster_row_ids[k - 1] = std::move(last_subvector);
680 cluster_row_ids = std::move(new_cluster_row_ids);
684 for (
unsigned label_id = 0; label_id < k; label_id++) {
685 std::size_t cluster_size = cluster_row_ids[label_id].size();
686 if (cluster_size == 0) {
continue; }
688 const MatrixXf &data = ld.
data(cluster_row_ids[label_id], all);
690 if (cluster_size <= MIN_INSTANCE_SLICE) {
692 cluster_column_candidates[label_id] = std::vector<SmallBitset>();
693 cluster_variable_candidates[label_id] = std::vector<SmallBitset>();
695 auto [current_column_candidates, current_variable_candidates] = rdc_split(data, ld.
variables);
696 if (current_column_candidates.size() > 1) { num_split_nodes++; }
697 cluster_column_candidates[label_id] = std::move(current_column_candidates);
698 cluster_variable_candidates[label_id] = std::move(current_variable_candidates);
704 ((num_split_nodes <= prev_num_split_nodes or prev_num_split_nodes == prev_cluster_row_ids.size())
705 and prev_num_split_nodes != 0) or k >= MAX_K
707 std::vector<std::unique_ptr<Sum::ChildWithWeight>> children;
708 for (std::size_t cluster_id = 0; cluster_id < k - 1; cluster_id++) {
709 const MatrixXf &data = ld.
data(prev_cluster_row_ids[cluster_id], all);
710 const MatrixXf &normalized = ld.
normalized(prev_cluster_row_ids[cluster_id], all);
711 const MatrixXi &null_matrix = ld.
null_matrix(prev_cluster_row_ids[cluster_id], all);
713 const float weight = float(data.rows())/float(
num_rows);
714 std::size_t cluster_vertical_partitions = prev_cluster_column_candidates[cluster_id].size();
718 std::unique_ptr<Node> child_node;
719 if (cluster_vertical_partitions == 0) {
721 }
else if (cluster_vertical_partitions == 1) {
726 prev_cluster_column_candidates[cluster_id],
727 prev_cluster_variable_candidates[cluster_id]
730 std::unique_ptr<Sum::ChildWithWeight> child = std::make_unique<Sum::ChildWithWeight>(
731 std::move(child_node),
733 prev_centroids.row(cluster_id)
736 children.push_back(std::move(child));
739 return std::make_unique<Sum>(std::move(children),
num_rows);
741 prev_num_split_nodes = num_split_nodes;
742 prev_cluster_column_candidates = std::move(cluster_column_candidates);
743 prev_cluster_variable_candidates = std::move(cluster_variable_candidates);
744 prev_cluster_row_ids = std::move(cluster_row_ids);
745 prev_centroids = centroids;
754 const auto num_cols = ld.
data.cols();
759 std::vector<DiscreteLeaf::Bin> bins;
762 return std::make_unique<DiscreteLeaf>(std::move(bins), 1.f,
num_rows);
764 unsigned null_counter = 0;
765 for (
auto i = 0; i <
num_rows; i++) {
770 const auto current_value = ld.
data(i, 0);
772 auto lower_bound = std::lower_bound(bins.begin(), bins.end(), current_value);
773 if (
lower_bound == bins.end()) { bins.emplace_back(current_value, 1.f); }
776 else { bins.emplace(
lower_bound, current_value, 1.f); }
779 bins[0].cumulative_probability /= float(
num_rows);
780 for (std::size_t i = 1; i < bins.size(); i++) {
781 bins[i].cumulative_probability /= float(
num_rows);
782 bins[i].cumulative_probability += bins[i-1].cumulative_probability;
784 return std::make_unique<DiscreteLeaf>(std::move(bins), null_counter /
float(
num_rows),
num_rows);
786 std::vector<ContinuousLeaf::Bin> bins;
789 return std::make_unique<ContinuousLeaf>(std::move(bins), 0.f, 0.f, 1.f,
num_rows);
791 const float max = ld.
data.maxCoeff();
792 const float min = ld.
data.minCoeff();
793 const auto num_bins = 1 + log2_ceil(std::size_t(
num_rows));
794 const float bin_width = (max - min) /
float(num_bins);
796 unsigned lower_bound_counter = 0.f;
797 unsigned null_counter = 0.f;
800 bins.reserve(num_bins);
801 for(std::size_t bin_id = 0; bin_id < num_bins; bin_id++) {
802 bins.emplace_back(min + (
float(bin_id + 1) * bin_width), 0.f);
806 for (
unsigned i = 0; i <
num_rows; i++) {
811 const auto current_value = ld.
data(i, 0);
813 lower_bound_counter++;
816 auto std_lower_bound = std::lower_bound(bins.begin(), bins.end(), current_value);
817 std_lower_bound->cumulative_probability++;
820 float lower_bound_probability = lower_bound_counter / float(
num_rows);
821 bins[0].cumulative_probability /= float(
num_rows);
822 bins[0].cumulative_probability += lower_bound_probability;
824 for (std::size_t i = 1; i < bins.size(); i++) {
825 bins[i].cumulative_probability /= float(
num_rows);
826 bins[i].cumulative_probability += bins[i-1].cumulative_probability;
828 return std::make_unique<ContinuousLeaf>(
831 lower_bound_probability,
842 auto [column_candidates, variable_candidates] = rdc_split(ld.
data, ld.
variables);
843 if (column_candidates.size() != 1) {
return create_product_rdc(ld, column_candidates, variable_candidates); }
851Spn Spn::learn_spn(Eigen::MatrixXf &data, Eigen::MatrixXi &null_matrix, std::vector<LeafType> &leaf_types)
854 MIN_INSTANCE_SLICE = std::max<std::size_t>((0.1 *
num_rows), 1);
857 std::vector<DiscreteLeaf::Bin> bins;
858 return Spn(0, std::make_unique<DiscreteLeaf>(std::move(bins), 0, 0));
862 for (std::size_t col_id = 0; col_id < data.cols(); col_id++) {
863 if (null_matrix.col(col_id).maxCoeff() == 0) {
continue; }
864 if (null_matrix.col(col_id).minCoeff() == 1) {
continue; }
866 int num_not_null = 0;
867 for (std::size_t row_id = 0; row_id < data.rows(); row_id++) {
868 if (null_matrix(row_id, col_id) == 1) {
continue; }
869 mean += (data(row_id, col_id) - mean) / ++num_not_null;
871 for (std::size_t row_id = 0; row_id < data.rows(); row_id++) {
872 if (null_matrix(row_id, col_id) == 1) { data(row_id, col_id) = mean; }
878 auto normalized = normalize_minmax(data);
884 std::move(leaf_types)
895 root_->update(row, variables, update_type);
915 auto filter_copy = filter;
916 filter_copy.emplace(attribute_id, std::make_pair(
EXPECTATION, 0.f));
920 if (!filter.empty()) {
925 return cond_expectation / llh;
948 return root_->estimate_number_distinct_values(attribute_id);
Eigen::MatrixXf create_CDF_matrix(const Eigen::MatrixXf &data)
Creates a new matrix, mapping every cell of data to its position according to the cell's column's CDF...
std::pair< std::vector< unsigned >, MatrixRXf > M_EXPORT kmeans_with_centroids(const Eigen::MatrixXf &data, unsigned k)
Clusters the given data according to the k-means algorithm.
float rdc_precomputed_CDF(Eigen::MatrixXf &CDFs_of_X, Eigen::MatrixXf &CDFs_of_Y, unsigned k=5, float stddev=1.f/6.f, URBG &&g=URBG())
Compute the RDC value without having to compute CDF matrices to avoid sorting the same data multiple ...
Eigen::Matrix< float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor > MatrixRXf
An adjacency matrix for a given query graph.
Implements a small and efficient set over integers in the range of 0 to 63 (including).
static SmallBitset All(std::size_t n)
Factory method for creating a SmallBitset with first n bits set.
std::size_t estimate_number_distinct_values(unsigned id) const override
void print(std::ostream &out, std::size_t num_tabs) const override
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
void print(std::ostream &out, std::size_t num_tabs) const override
std::pair< float, float > evaluate(const Filter &bin_value, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
std::size_t estimate_number_distinct_values(unsigned id) const override
const Eigen::MatrixXf & data
const Eigen::MatrixXf & normalized
std::vector< LeafType > leaf_types
const Eigen::MatrixXi & null_matrix
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
void print(std::ostream &out, std::size_t num_tabs) const override
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
std::size_t estimate_number_distinct_values(unsigned id) const override
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
std::size_t estimate_number_distinct_values(unsigned id) const override
void print(std::ostream &out, std::size_t num_tabs) const override
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
Tree structure for Sum Product Networks.
static Spn learn_spn(Eigen::MatrixXf &data, Eigen::MatrixXi &null_matrix, std::vector< LeafType > &leaf_types)
Learn an SPN over the given data.
float lower_bound(const Filter &filter) const
Compute the lower bound probability for continuous domains.
void update_row(Eigen::VectorXf &old_row, Eigen::VectorXf &updated_row)
Update the SPN with the given row.
float likelihood(const Filter &filter) const
Compute the likelihood of the given filter predicates given by a map from attribute to the respective...
void delete_row(Eigen::VectorXf &row)
Delete the given row from the SPN.
float upper_bound(const Filter &filter) const
Compute the upper bound probability for continuous domains.
std::size_t estimate_number_distinct_values(unsigned attribute_id) const
Estimate the number of distinct values of the given attribute.
float expectation(unsigned attribute_id, const Filter &filter) const
Compute the expectation of the given attribute.
static std::unique_ptr< Spn::Product > create_product_min_slice(LearningData &learning_data)
Create a product node by splitting all columns.
std::size_t num_rows() const
Returns the number of rows in the SPN.
void insert_row(Eigen::VectorXf &row)
Insert the given row into the SPN.
static std::unique_ptr< Node > learn_node(LearningData &learning_data)
Recursively learns the nodes of an SPN.
std::unique_ptr< Node > root_
std::unordered_map< unsigned, std::pair< SpnOperator, float > > Filter
static std::unique_ptr< Spn::Sum > create_sum(LearningData &learning_data)
Create a sum node by clustering the rows.
void update(Eigen::VectorXf &row, UpdateType update_type)
Update the SPN from the top down and adjust weights of sum nodes and the distributions on leaves.
static std::unique_ptr< Product > create_product_rdc(LearningData &learning_data, std::vector< SmallBitset > &column_candidates, std::vector< SmallBitset > &variable_candidates)
Create a product node with the given candidates (vertical clustering)