mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
Spn.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <Eigen/Core>
4#include <iostream>
5#include <map>
6#include <memory>
8#include <set>
9#include <type_traits>
10#include <unordered_map>
11#include <vector>
12
13
14namespace m {
15
19struct Spn
20{
21 public:
23 enum LeafType {
27 };
36 };
37 enum EvalType {
41 };
44 DELETE
45 };
46
47 using Filter = std::unordered_map<unsigned, std::pair<SpnOperator, float>>;
48
49 private:
50
52 {
53 const Eigen::MatrixXf &data;
54 const Eigen::MatrixXf &normalized;
55 const Eigen::MatrixXi &null_matrix;
57 std::vector<LeafType> leaf_types;
58
60 const Eigen::MatrixXf &data,
61 const Eigen::MatrixXf &normalized,
62 const Eigen::MatrixXi &null_matrix,
64 std::vector<LeafType> leaf_types
65 )
66 : data(data)
70 , leaf_types(std::move(leaf_types))
71 { }
72 };
73
74 struct Node
75 {
76 std::size_t num_rows;
77
78 explicit Node(std::size_t num_rows) : num_rows(num_rows) { }
79
80 virtual ~Node() = default;
81 void dump() const;
82 void dump(std::ostream &out) const;
83
91 virtual std::pair<float, float> evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const = 0;
92
93 virtual void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) = 0;
94
95 virtual std::size_t estimate_number_distinct_values(unsigned id) const = 0;
96
97 virtual unsigned height() const = 0;
98 virtual unsigned breadth() const = 0;
99 virtual unsigned degree() const = 0;
100 virtual std::size_t memory_usage() const = 0;
101
102 virtual void print(std::ostream &out, std::size_t num_tabs) const = 0;
103 };
104
105 struct Sum : Node
106 {
108 {
109 std::unique_ptr<Node> child;
110 float weight;
111 Eigen::VectorXf centroid;
112
113 ChildWithWeight(std::unique_ptr<Node> child, float weight, Eigen::VectorXf centroid)
114 : child(std::move(child))
115 , weight(weight)
116 , centroid(std::move(centroid))
117 { }
118 };
119
120 std::vector<std::unique_ptr<ChildWithWeight>> children;
121
122 Sum(std::vector<std::unique_ptr<ChildWithWeight>> children, std::size_t num_rows)
123 : Node(num_rows)
124 , children(std::move(children))
125 { }
126
127 std::pair<float, float> evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override;
128
129 void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override;
130
131 std::size_t estimate_number_distinct_values(unsigned id) const override;
132
133 unsigned height() const override {
134 unsigned max_height = 0;
135 for (auto &child : children) { max_height = std::max(max_height, child->child->height()); }
136 return 1 + max_height;
137 }
138 unsigned breadth() const override {
139 unsigned breadth = 0;
140 for (auto &child : children) { breadth += child->child->breadth(); }
141 return breadth;
142 }
143 unsigned degree() const override {
144 unsigned max_degree = children.size();
145 for (auto &child : children) { max_degree = std::max(max_degree, child->child->degree()); }
146 return max_degree;
147 }
148 std::size_t memory_usage() const override {
149 std::size_t memory_used = 0;
150 memory_used += sizeof *this + children.size() * sizeof(decltype(children)::value_type);
151 for (auto &child : children) {
152 memory_used += child->child->memory_usage();
153 }
154 return memory_used;
155 }
156
157 void print(std::ostream &out, std::size_t num_tabs) const override;
158 };
159
160 struct Product : Node
161 {
163 {
164 std::unique_ptr<Node> child;
166
168 : child(std::move(child))
170 { }
171 };
172
173 std::vector<std::unique_ptr<ChildWithVariables>> children;
174
175 Product(std::vector<std::unique_ptr<ChildWithVariables>> children, std::size_t num_rows)
176 : Node(num_rows)
177 , children(std::move(children))
178 { }
179
180 std::pair<float, float> evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override;
181
182 void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override;
183
184 std::size_t estimate_number_distinct_values(unsigned id) const override;
185
186 unsigned height() const override {
187 unsigned max_height = 0;
188 for (auto &child : children) { max_height = std::max(max_height, child->child->height()); }
189 return 1 + max_height;
190 }
191 unsigned breadth() const override {
192 unsigned breadth = 0;
193 for (auto &child : children) { breadth += child->child->breadth(); }
194 return breadth;
195 }
196 unsigned degree() const override {
197 unsigned max_degree = children.size();
198 for (auto &child : children) { max_degree = std::max(max_degree, child->child->degree()); }
199 return max_degree;
200 }
201 std::size_t memory_usage() const override {
202 std::size_t memory_used = 0;
203 memory_used += sizeof *this + children.size() * sizeof(decltype(children)::value_type);
204 for (auto &child : children) {
205 memory_used += child->variables.size() * sizeof child->variables;
206 memory_used += child->child->memory_usage();
207 }
208 return memory_used;
209 }
210
211 void print(std::ostream &out, std::size_t num_tabs) const override;
212 };
213
215 {
216 struct Bin
217 {
218 float value;
221
223 : value(value)
225 { }
226
227 bool operator<(Bin &other) const { return this->value < other.value; }
228 bool operator<(float other) const { return this->value < other; }
229 };
230
231 std::vector<Bin> bins;
233
234 DiscreteLeaf(std::vector<Bin> bins, float null_probability, std::size_t num_rows)
235 : Node(num_rows)
236 , bins(std::move(bins))
238 { }
239
240 std::pair<float, float> evaluate(const Filter &bin_value, unsigned leaf_id, EvalType eval_type) const override;
241
242 void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override;
243
244 std::size_t estimate_number_distinct_values(unsigned id) const override;
245
246 unsigned height() const override { return 0; }
247 unsigned breadth() const override { return 1; }
248 unsigned degree() const override { return 0; }
249 std::size_t memory_usage() const override {
250 return sizeof *this + bins.size() * sizeof(decltype(bins)::value_type);
251 }
252 void print(std::ostream &out, std::size_t num_tabs) const override;
253 };
254
256 {
257 struct Bin
258 {
262
266 { }
267
268 bool operator<(Bin &other) const { return this->upper_bound < other.upper_bound; }
269 bool operator<(float other) const { return this->upper_bound < other; }
270 };
271
272 std::vector<Bin> bins;
276
278 std::vector<Bin> bins,
279 float lower_bound,
281 float null_probability,
282 std::size_t num_rows
283 )
284 : Node(num_rows)
285 , bins(std::move(bins))
289 { }
290
291 std::pair<float, float> evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override;
292
293 void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override;
294
295 std::size_t estimate_number_distinct_values(unsigned id) const override;
296
297 unsigned height() const override { return 0; }
298 unsigned breadth() const override { return 1; }
299 unsigned degree() const override { return 0; }
300 std::size_t memory_usage() const override {
301 return sizeof *this + bins.size() * sizeof(decltype(bins)::value_type);
302 }
303 void print(std::ostream &out, std::size_t num_tabs) const override;
304 };
305
306 std::size_t num_rows_;
307 std::unique_ptr<Node> root_;
308
309 Spn(std::size_t num_rows, std::unique_ptr<Node> root) : num_rows_(num_rows), root_(std::move(root)) { }
310
311 public:
312
314 std::size_t num_rows() const { return num_rows_; }
315
316 /*==================================================================================================================
317 * Learning
318 *================================================================================================================*/
319
320 private:
321
323 static std::unique_ptr<Spn::Product> create_product_min_slice(LearningData &learning_data);
324
326 static std::unique_ptr<Product> create_product_rdc(
327 LearningData &learning_data,
328 std::vector<SmallBitset> &column_candidates,
329 std::vector<SmallBitset> &variable_candidates
330 );
331
333 static std::unique_ptr<Spn::Sum> create_sum(LearningData &learning_data);
334
336 static std::unique_ptr<Node> learn_node(LearningData &learning_data);
337
338 public:
339
348 static Spn learn_spn(Eigen::MatrixXf &data, Eigen::MatrixXi &null_matrix, std::vector<LeafType> &leaf_types);
349
350 /*==================================================================================================================
351 * Inference
352 *================================================================================================================*/
353
354 private:
355
361 void update(Eigen::VectorXf &row, UpdateType update_type);
362
363 public:
364
367 float likelihood(const Filter &filter) const;
368
370 float upper_bound(const Filter &filter) const;
371
373 float lower_bound(const Filter &filter) const;
374
376 float expectation(unsigned attribute_id, const Filter &filter) const;
377
379 void update_row(Eigen::VectorXf &old_row, Eigen::VectorXf &updated_row);
380
382 void insert_row(Eigen::VectorXf &row);
383
385 void delete_row(Eigen::VectorXf &row);
386
388 std::size_t estimate_number_distinct_values(unsigned attribute_id) const;
389
390 unsigned height() const { return root_->height(); }
391 unsigned breadth() const { return root_->breadth(); }
392 unsigned degree() const { return root_->degree(); }
393 std::size_t memory_usage() const {
394 std::size_t memory_used = 0;
395 memory_used += sizeof root_;
396 memory_used += root_->memory_usage();
397 return memory_used;
398 }
399
400 void dump() const;
401 void dump(std::ostream &out) const;
402};
403
404}
‍mutable namespace
Definition: Backend.hpp:10
STL namespace.
Implements a small and efficient set over integers in the range of 0 to 63 (including).
Definition: ADT.hpp:26
float cumulative_probability
‍the cumulative probability of this and all predecessor bins; in the range [0;1]
Definition: Spn.hpp:261
bool operator<(float other) const
Definition: Spn.hpp:269
float upper_bound
the upper bound of this bin
Definition: Spn.hpp:259
bool operator<(Bin &other) const
Definition: Spn.hpp:268
Bin(float upper_bound, float cumulative_probability)
Definition: Spn.hpp:263
unsigned height() const override
Definition: Spn.hpp:297
std::size_t estimate_number_distinct_values(unsigned id) const override
Definition: Spn.cpp:547
std::vector< Bin > bins
bins of this leaf
Definition: Spn.hpp:272
void print(std::ostream &out, std::size_t num_tabs) const override
Definition: Spn.cpp:552
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
Definition: Spn.cpp:375
unsigned degree() const override
Definition: Spn.hpp:299
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
Definition: Spn.cpp:478
float null_probability
the probability of null values in this leaf
Definition: Spn.hpp:275
unsigned breadth() const override
Definition: Spn.hpp:298
float lower_bound
the lower bound of the first bin
Definition: Spn.hpp:273
std::size_t memory_usage() const override
Definition: Spn.hpp:300
ContinuousLeaf(std::vector< Bin > bins, float lower_bound, float lower_bound_probability, float null_probability, std::size_t num_rows)
Definition: Spn.hpp:277
float lower_bound_probability
probability of the lower_bound
Definition: Spn.hpp:274
bool operator<(Bin &other) const
Definition: Spn.hpp:227
float cumulative_probability
‍the cumulative probability of this and all predecessor bins; in the range [0;1]
Definition: Spn.hpp:220
Bin(float value, float cumulative_probability)
Definition: Spn.hpp:222
bool operator<(float other) const
Definition: Spn.hpp:228
float value
the value of this bin
Definition: Spn.hpp:218
unsigned breadth() const override
Definition: Spn.hpp:247
unsigned degree() const override
Definition: Spn.hpp:248
std::vector< Bin > bins
bins of this leaf
Definition: Spn.hpp:231
std::size_t memory_usage() const override
Definition: Spn.hpp:249
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
Definition: Spn.cpp:299
void print(std::ostream &out, std::size_t num_tabs) const override
Definition: Spn.cpp:354
std::pair< float, float > evaluate(const Filter &bin_value, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
Definition: Spn.cpp:227
DiscreteLeaf(std::vector< Bin > bins, float null_probability, std::size_t num_rows)
Definition: Spn.hpp:234
std::size_t estimate_number_distinct_values(unsigned id) const override
Definition: Spn.cpp:349
float null_probability
the probability of null values in this leaf
Definition: Spn.hpp:232
unsigned height() const override
Definition: Spn.hpp:246
const Eigen::MatrixXf & data
Definition: Spn.hpp:53
const Eigen::MatrixXf & normalized
Definition: Spn.hpp:54
std::vector< LeafType > leaf_types
Definition: Spn.hpp:57
LearningData(const Eigen::MatrixXf &data, const Eigen::MatrixXf &normalized, const Eigen::MatrixXi &null_matrix, SmallBitset variables, std::vector< LeafType > leaf_types)
Definition: Spn.hpp:59
SmallBitset variables
Definition: Spn.hpp:56
const Eigen::MatrixXi & null_matrix
Definition: Spn.hpp:55
Node(std::size_t num_rows)
Definition: Spn.hpp:78
virtual unsigned degree() const =0
void dump() const
Definition: Spn.cpp:92
virtual void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type)=0
virtual void print(std::ostream &out, std::size_t num_tabs) const =0
virtual std::size_t estimate_number_distinct_values(unsigned id) const =0
virtual std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const =0
Evaluate the SPN bottom up with a filter condition.
virtual unsigned breadth() const =0
virtual std::size_t memory_usage() const =0
std::size_t num_rows
Definition: Spn.hpp:76
virtual ~Node()=default
virtual unsigned height() const =0
ChildWithVariables(std::unique_ptr< Node > child, SmallBitset variables)
Definition: Spn.hpp:167
std::unique_ptr< Node > child
a child of the Product node
Definition: Spn.hpp:164
SmallBitset variables
the set of variables(attributes), that are in this child
Definition: Spn.hpp:165
unsigned breadth() const override
Definition: Spn.hpp:191
unsigned height() const override
Definition: Spn.hpp:186
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
Definition: Spn.cpp:163
void print(std::ostream &out, std::size_t num_tabs) const override
Definition: Spn.cpp:210
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
Definition: Spn.cpp:180
Product(std::vector< std::unique_ptr< ChildWithVariables > > children, std::size_t num_rows)
Definition: Spn.hpp:175
unsigned degree() const override
Definition: Spn.hpp:196
std::size_t estimate_number_distinct_values(unsigned id) const override
Definition: Spn.cpp:199
std::size_t memory_usage() const override
Definition: Spn.hpp:201
std::vector< std::unique_ptr< ChildWithVariables > > children
Definition: Spn.hpp:173
float weight
weight of a child of a sum node
Definition: Spn.hpp:110
std::unique_ptr< Node > child
Definition: Spn.hpp:109
Eigen::VectorXf centroid
centroid of this child according to kmeans cluster
Definition: Spn.hpp:111
ChildWithWeight(std::unique_ptr< Node > child, float weight, Eigen::VectorXf centroid)
Definition: Spn.hpp:113
void update(Eigen::VectorXf &row, SmallBitset variables, UpdateType update_type) override
Definition: Spn.cpp:113
std::vector< std::unique_ptr< ChildWithWeight > > children
Definition: Spn.hpp:120
unsigned breadth() const override
Definition: Spn.hpp:138
Sum(std::vector< std::unique_ptr< ChildWithWeight > > children, std::size_t num_rows)
Definition: Spn.hpp:122
unsigned degree() const override
Definition: Spn.hpp:143
std::size_t estimate_number_distinct_values(unsigned id) const override
Definition: Spn.cpp:138
std::size_t memory_usage() const override
Definition: Spn.hpp:148
void print(std::ostream &out, std::size_t num_tabs) const override
Definition: Spn.cpp:148
std::pair< float, float > evaluate(const Filter &filter, unsigned leaf_id, EvalType eval_type) const override
Evaluate the SPN bottom up with a filter condition.
Definition: Spn.cpp:100
unsigned height() const override
Definition: Spn.hpp:133
Tree structure for Sum Product Networks.
Definition: Spn.hpp:20
static Spn learn_spn(Eigen::MatrixXf &data, Eigen::MatrixXi &null_matrix, std::vector< LeafType > &leaf_types)
Learn an SPN over the given data.
Definition: Spn.cpp:851
void dump() const
Definition: Spn.cpp:951
float lower_bound(const Filter &filter) const
Compute the lower bound probability for continuous domains.
Definition: Spn.cpp:908
EvalType
Definition: Spn.hpp:37
@ APPROXIMATE
Definition: Spn.hpp:38
@ LOWER_BOUND
Definition: Spn.hpp:40
@ UPPER_BOUND
Definition: Spn.hpp:39
void update_row(Eigen::VectorXf &old_row, Eigen::VectorXf &updated_row)
Update the SPN with the given row.
Definition: Spn.cpp:928
float likelihood(const Filter &filter) const
Compute the likelihood of the given filter predicates given by a map from attribute to the respective...
Definition: Spn.cpp:898
unsigned degree() const
Definition: Spn.hpp:392
void delete_row(Eigen::VectorXf &row)
Delete the given row from the SPN.
Definition: Spn.cpp:940
float upper_bound(const Filter &filter) const
Compute the upper bound probability for continuous domains.
Definition: Spn.cpp:903
std::size_t estimate_number_distinct_values(unsigned attribute_id) const
Estimate the number of distinct values of the given attribute.
Definition: Spn.cpp:946
float expectation(unsigned attribute_id, const Filter &filter) const
Compute the expectation of the given attribute.
Definition: Spn.cpp:913
SpnOperator
Definition: Spn.hpp:28
@ GREATER
Definition: Spn.hpp:32
@ EQUAL
Definition: Spn.hpp:29
@ LESS
Definition: Spn.hpp:30
@ GREATER_EQUAL
Definition: Spn.hpp:33
@ LESS_EQUAL
Definition: Spn.hpp:31
@ IS_NULL
Definition: Spn.hpp:34
@ EXPECTATION
Definition: Spn.hpp:35
UpdateType
Definition: Spn.hpp:42
@ DELETE
Definition: Spn.hpp:44
@ INSERT
Definition: Spn.hpp:43
unsigned height() const
Definition: Spn.hpp:390
static std::unique_ptr< Spn::Product > create_product_min_slice(LearningData &learning_data)
Create a product node by splitting all columns.
Definition: Spn.cpp:577
std::size_t num_rows() const
returns the number of rows in the SPN.
Definition: Spn.hpp:314
void insert_row(Eigen::VectorXf &row)
Insert the given row into the SPN.
Definition: Spn.cpp:934
static std::unique_ptr< Node > learn_node(LearningData &learning_data)
Recursively learns the nodes of an SPN.
Definition: Spn.cpp:751
std::unique_ptr< Node > root_
Definition: Spn.hpp:307
std::unordered_map< unsigned, std::pair< SpnOperator, float > > Filter
Definition: Spn.hpp:47
Spn(std::size_t num_rows, std::unique_ptr< Node > root)
Definition: Spn.hpp:309
unsigned breadth() const
Definition: Spn.hpp:391
static std::unique_ptr< Spn::Sum > create_sum(LearningData &learning_data)
Create a sum node by clustering the rows.
Definition: Spn.cpp:633
std::size_t num_rows_
Definition: Spn.hpp:306
void update(Eigen::VectorXf &row, UpdateType update_type)
Update the SPN from the top down and adjust weights of sum nodes and the distributions on leaves.
Definition: Spn.cpp:892
std::size_t memory_usage() const
Definition: Spn.hpp:393
LeafType
The different types of leaves for an attribute.
Definition: Spn.hpp:23
@ CONTINUOUS
Definition: Spn.hpp:26
@ AUTO
Definition: Spn.hpp:24
@ DISCRETE
Definition: Spn.hpp:25
static std::unique_ptr< Product > create_product_rdc(LearningData &learning_data, std::vector< SmallBitset > &column_candidates, std::vector< SmallBitset > &variable_candidates)
Create a product node with the given candidates (vertical clustering)
Definition: Spn.cpp:602