mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
CardinalityEstimator.cpp
Go to the documentation of this file.
2
5#include "util/Spn.hpp"
6#include <algorithm>
7#include <cstddef>
8#include <cstdlib>
9#include <cstring>
10#include <filesystem>
12#include <mutable/IR/CNF.hpp>
16#include <mutable/Options.hpp>
18#include <mutable/util/Pool.hpp>
19#include <nlohmann/json.hpp>
20
21
22using namespace m;
23
24
25namespace {
26
27namespace options {
28
29std::filesystem::path injected_cardinalities_file;
30
31}
32
33}
34
35/*======================================================================================================================
36 * CardinalityEstimator
37 *====================================================================================================================*/
38
40
42
44{
45 throw data_model_exception("predicting the number of distinct values is not supported by this data model.");
46};
47
49void CardinalityEstimator::dump(std::ostream &out) const
50{
51 print(out);
52 out << std::endl;
53}
54
55void CardinalityEstimator::dump() const { dump(std::cerr); }
57
58
59/*======================================================================================================================
60 * CartesianProductEstimator
61 *====================================================================================================================*/
62
63std::unique_ptr<DataModel> CartesianProductEstimator::empty_model() const
64{
65 auto model = std::make_unique<CartesianProductDataModel>();
66 model->size = 0;
67 return model;
68}
69
70std::unique_ptr<DataModel> CartesianProductEstimator::estimate_scan(const QueryGraph &G, Subproblem P) const
71{
72 M_insist(P.size() == 1, "Subproblem must identify exactly one DataSource");
73 auto idx = *P.begin();
74 auto &BT = as<const BaseTable>(*G.sources()[idx]);
75 auto model = std::make_unique<CartesianProductDataModel>();
76 model->size = BT.table().store().num_rows();
77 return model;
78}
79
80std::unique_ptr<DataModel>
82{
83 /* This model cannot estimate the effects of applying a filter. */
84 auto &data = as<const CartesianProductDataModel>(_data);
85 return std::make_unique<CartesianProductDataModel>(data); // copy
86}
87
88std::unique_ptr<DataModel>
89CartesianProductEstimator::estimate_limit(const QueryGraph&, const DataModel &_data, std::size_t limit,
90 std::size_t offset) const
91{
92 auto data = as<const CartesianProductDataModel>(_data);
93 const std::size_t remaining = offset > data.size ? 0UL : data.size - offset;
94 auto model = std::make_unique<CartesianProductDataModel>();
95 model->size = std::min(remaining, limit);
96 return model;
97}
98
99std::unique_ptr<DataModel>
101 const std::vector<group_type>&) const
102{
103 auto &data = as<const CartesianProductDataModel>(_data);
104 auto model = std::make_unique<CartesianProductDataModel>();
105 model->size = data.size; // this model cannot estimate the effects of grouping
106 return model;
107}
108
109std::unique_ptr<DataModel>
111 const cnf::CNF&) const
112{
113 auto left = as<const CartesianProductDataModel>(_left);
114 auto right = as<const CartesianProductDataModel>(_right);
115 auto model = std::make_unique<CartesianProductDataModel>();
116 model->size = left.size * right.size; // this model cannot estimate the effects of a join condition
117 return model;
118}
119
120template<typename PlanTable>
121std::unique_ptr<DataModel>
123 const cnf::CNF&) const
124{
125 M_insist(not to_join.empty());
126 auto model = std::make_unique<CartesianProductDataModel>();
127 model->size = 1UL;
128 for (auto it = to_join.begin(); it != to_join.end(); ++it)
129 model->size *= as<const CartesianProductDataModel>(*PT[it.as_set()].model).size;
130 return model;
131}
132
133template
134std::unique_ptr<DataModel>
136 Subproblem, const cnf::CNF&) const;
137template
138std::unique_ptr<DataModel>
140 Subproblem, const cnf::CNF&) const;
141
143{
144 return as<const CartesianProductDataModel>(data).size;
145}
146
148void CartesianProductEstimator::print(std::ostream &out) const
149{
150 out << "CartesianProductEstimator - returns size of the Cartesian product of the given subproblems";
151}
153
154
155/*======================================================================================================================
156 * InjectionCardinalityEstimator
157 *====================================================================================================================*/
158
159/*----- Constructors -------------------------------------------------------------------------------------------------*/
160
162 : fallback_(name_of_database)
163{
164 Diagnostic diag(Options::Get().has_color, std::cout, std::cerr);
165 Position pos("InjectionCardinalityEstimator");
166
167 if (options::injected_cardinalities_file.empty()) {
168 std::cout << "No injection file was passed.\n";
169 } else {
170 std::ifstream in(options::injected_cardinalities_file);
171 if (in) {
172 read_json(diag, in, name_of_database);
173 } else {
174 diag.w(pos) << "Could not open file " << options::injected_cardinalities_file << ".\n"
175 << "A dummy estimator will be used to do estimations.\n";
176 }
177 }
178}
179
181 std::istream &in)
182 : fallback_(name_of_database)
183{
184 read_json(diag, in, name_of_database);
185}
186
188 const ThreadSafePooledString &name_of_database)
189{
190 Catalog &C = Catalog::Get();
191 Position pos("InjectionCardinalityEstimator");
192 std::string prev_relation;
193
194 using json = nlohmann::json;
195 json cardinalities;
196 try {
197 in >> cardinalities;
198 } catch (json::parse_error parse_error) {
199 diag.w(pos) << "The file could not be parsed as json. Parser error output:\n"
200 << parse_error.what() << "\n"
201 << "A dummy estimator will be used to do estimations.\n";
202 return;
203 }
204 json *database_entry;
205 try {
206 database_entry = &cardinalities.at(*name_of_database); //throws if key does not exist
207 } catch (json::out_of_range &out_of_range) {
208 diag.w(pos) << "No entry for the db " << name_of_database << " in the file.\n"
209 << "A dummy estimator will be used to do estimations.\n";
210 return;
211 }
212 cardinality_table_.reserve(database_entry->size());
213 std::vector<std::string> names;
214 for (auto &subproblem_entry : *database_entry) {
215 json* relations_array;
216 json* size;
217 try {
218 relations_array = &subproblem_entry.at("relations");
219 size = &subproblem_entry.at("size");
220 } catch (json::exception &exception) {
221 diag.w(pos) << "The entry " << subproblem_entry << " for the db \"" << name_of_database << "\""
222 << " does not have the required form of {\"relations\": ..., \"size\": ... } "
223 << "and will thus be ignored.\n";
224 continue;
225 }
226
227 names.clear();
228 for (auto it = relations_array->begin(); it != relations_array->end(); ++it)
229 names.emplace_back(it->get<std::string>());
230 std::sort(names.begin(), names.end());
231
232 buf_.clear();
233 for (auto it = names.begin(); it != names.end(); ++it) {
234 if (it != names.begin())
235 buf_.emplace_back('$');
236 buf_append(*it);
237 }
238 buf_.emplace_back(0);
240 auto res = cardinality_table_.emplace(std::move(str), *size);
241 M_insist(res.second, "insertion must not fail as we do not allow for duplicates in the input file");
242 }
243}
244
245/*----- Model calculation --------------------------------------------------------------------------------------------*/
246
247std::unique_ptr<DataModel> InjectionCardinalityEstimator::empty_model() const
248{
249 return std::make_unique<InjectionCardinalityDataModel>(Subproblem(), 0);
250}
251
252std::unique_ptr<DataModel> InjectionCardinalityEstimator::estimate_scan(const QueryGraph &G, Subproblem P) const
253{
254 M_insist(P.size() == 1);
255 const auto idx = *P.begin();
256 auto &DS = *G.sources()[idx];
257
258 if (auto it = cardinality_table_.find(DS.name().assert_not_none()); it != cardinality_table_.end()) {
259 return std::make_unique<InjectionCardinalityDataModel>(P, it->second);
260 } else {
261 /* no match, fall back */
262 auto fallback_model = fallback_.estimate_scan(G, P);
263 return std::make_unique<InjectionCardinalityDataModel>(P, fallback_.predict_cardinality(*fallback_model));
264 }
265}
266
267std::unique_ptr<DataModel>
269{
270 /* This model cannot estimate the effects of applying a filter. */
271 auto &data = as<const InjectionCardinalityDataModel>(_data);
272 return std::make_unique<InjectionCardinalityDataModel>(data); // copy
273}
274
275std::unique_ptr<DataModel>
277 std::size_t offset) const
278{
279 auto &data = as<const InjectionCardinalityDataModel>(_data);
280 const std::size_t remaining = offset > data.size_ ? 0UL : data.size_ - offset;
281 return std::make_unique<InjectionCardinalityDataModel>(data.subproblem_, std::min(remaining, limit));
282}
283
284std::unique_ptr<DataModel>
286 const std::vector<group_type> &exprs) const
287{
288 auto &data = as<const InjectionCardinalityDataModel>(_data);
289
290 if (exprs.empty())
291 return std::make_unique<InjectionCardinalityDataModel>(data.subproblem_, 1); // single group
292
293 /* Combine grouping keys into an identifier. */
294 oss_.str("");
295 oss_ << "g";
296 for (auto [grp, alias] : exprs) {
297 oss_ << '#';
298 if (alias.has_value())
299 oss_ << alias;
300 else
301 oss_ << grp.get();
302 }
303 ThreadSafePooledString id = Catalog::Get().pool(oss_.str().c_str());
304
305 if (auto it = cardinality_table_.find(id); it != cardinality_table_.end()) {
306 /* Clamp injected cardinality to at most the cardinality of the grouping's child since it cannot produce more
307 * tuples than it receives. */
308 return std::make_unique<InjectionCardinalityDataModel>(data.subproblem_, std::min(it->second, data.size_));
309 } else {
310 /* This model cannot estimate the effects of grouping. */
311 return std::make_unique<InjectionCardinalityDataModel>(data); // copy
312 }
313}
314
315std::unique_ptr<DataModel>
317 const cnf::CNF &condition) const
318{
319 auto &left = as<const InjectionCardinalityDataModel>(_left);
320 auto &right = as<const InjectionCardinalityDataModel>(_right);
321
322 const Subproblem subproblem = left.subproblem_ | right.subproblem_;
323 ThreadSafePooledString id = make_identifier(G, subproblem);
324
325 /* Lookup cardinality in table. */
326 if (auto it = cardinality_table_.find(id); it != cardinality_table_.end()) {
327 /* Clamp injected cardinality to at most the cardinality of the cartesian product of the join's children
328 * since it cannot produce more tuples than that. */
329 const std::size_t max_cardinality = left.size_ * right.size_;
330 return std::make_unique<InjectionCardinalityDataModel>(subproblem, std::min(it->second, max_cardinality));
331 } else {
332 /* Fallback to CartesianProductEstimator. */
333 if (not Options::Get().quiet)
334 std::cerr << "warning: failed to estimate the join of " << left.subproblem_ << " and " << right.subproblem_
335 << '\n';
336 auto left_fallback = std::make_unique<CartesianProductEstimator::CartesianProductDataModel>();
337 left_fallback->size = left.size_;
338 auto right_fallback = std::make_unique<CartesianProductEstimator::CartesianProductDataModel>();
339 right_fallback->size = right.size_;
340 auto fallback_model = fallback_.estimate_join(G, *left_fallback, *right_fallback, condition);
341 return std::make_unique<InjectionCardinalityDataModel>(subproblem,
342 fallback_.predict_cardinality(*fallback_model));
343 }
344}
345
346template<typename PlanTable>
347std::unique_ptr<DataModel>
349 Subproblem to_join, const cnf::CNF&) const
350{
352 if (auto it = cardinality_table_.find(id); it != cardinality_table_.end()) {
353 /* Clamp injected cardinality to at most the cardinality of the cartesian product of the join's children
354 * since it cannot produce more tuples than that. */
355 std::size_t max_cardinality = 1;
356 for (auto it = to_join.begin(); it != to_join.end(); ++it)
357 max_cardinality *= as<const InjectionCardinalityDataModel>(*PT[it.as_set()].model).size_;
358 return std::make_unique<InjectionCardinalityDataModel>(to_join, std::min(it->second, max_cardinality));
359 } else {
360 /* Fallback to cartesian product. */
361 if (not Options::Get().quiet)
362 std::cerr << "warning: failed to estimate the join of all data sources in " << to_join << '\n';
363 auto ds_it = to_join.begin();
364 std::size_t size = as<const InjectionCardinalityDataModel>(*PT[ds_it.as_set()].model).size_;
365 for (; ds_it != to_join.end(); ++ds_it)
366 size *= as<const InjectionCardinalityDataModel>(*PT[ds_it.as_set()].model).size_;
367 return std::make_unique<InjectionCardinalityDataModel>(to_join, size);
368 }
369}
370
371template
372std::unique_ptr<DataModel>
374 Subproblem, const cnf::CNF&) const;
375
376template
377std::unique_ptr<DataModel>
379 Subproblem, const cnf::CNF&) const;
380
382{
383 return as<const InjectionCardinalityDataModel>(data).size_;
384}
385
387void InjectionCardinalityEstimator::print(std::ostream &out) const
388{
389 constexpr uint32_t max_rows_printed = 100;
390 std::size_t sub_len = 13;
391 for (auto &entry : cardinality_table_)
392 sub_len = std::max(sub_len, strlen(*entry.first));
393
394 out << std::left << "InjectionCardinalityEstimator\n"
395 << std::setw(sub_len) << "Subproblem" << "Size" << "\n" << std::right;
396
397 /* ------- Print maximum max_rows_printed rows of the cardinality_table_ */
398 uint32_t counter = 0;
399 for (auto &entry : cardinality_table_) {
400 if (counter >= max_rows_printed) break;
401 out << std::left << std::setw(sub_len) << entry.first << entry.second << "\n";
402 counter++;
403 }
404}
406
408{
409 auto &C = Catalog::Get();
410 static thread_local std::vector<ThreadSafePooledString> names;
411 names.clear();
412 for (auto id : S)
413 names.emplace_back(G.sources()[id]->name());
414 std::sort(names.begin(), names.end(), [](auto lhs, auto rhs){ return strcmp(*lhs, *rhs) < 0; });
415
416 buf_.clear();
417 for (auto it = names.begin(); it != names.end(); ++it) {
418 if (it != names.begin())
419 buf_.emplace_back('$');
420 buf_append(**it);
421 }
422
423 buf_.emplace_back(0);
424 return C.pool(buf_view());
425}
426
427
428/*======================================================================================================================
429 * SpnEstimator
430 *====================================================================================================================*/
431
432namespace {
433
435struct FilterTranslator : ast::ConstASTExprVisitor {
436 Catalog &C = Catalog::Get();
437 ThreadSafePooledString attribute;
438 float value;
440
441 FilterTranslator() : attribute(C.pool("")), value(0), op(Spn::EQUAL) { }
442
443 using ConstASTExprVisitor::operator();
444
445 void operator()(const ast::Designator &designator) { attribute = designator.attr_name.text.assert_not_none(); }
446
447 void operator()(const ast::Constant &constant) {
448 auto val = Interpreter::eval(constant);
449
451 [&val, this](const Numeric &numeric) {
452 switch (numeric.kind) {
453 case Numeric::N_Int:
454 value = float(val.as_i());
455 break;
456 case Numeric::N_Float:
457 value = val.as_f();
458 break;
459 case Numeric::N_Decimal:
460 value = float(val.as_d());
461 break;
462 }
463 },
464 [this](const NoneType&) { op = Spn::IS_NULL; },
465 [](auto&&) { M_unreachable("Unsupported type."); },
466 }, *constant.type());
467 }
468
469 void operator()(const ast::BinaryExpr &binary_expr) {
470 switch (binary_expr.op().type) {
471 case TK_EQUAL:
472 op = Spn::EQUAL;
473 break;
474 case TK_LESS:
475 op = Spn::LESS;
476 break;
477 case TK_LESS_EQUAL:
479 break;
480 case TK_GREATER:
482 break;
483 case TK_GREATER_EQUAL:
485 break;
486 default:
487 M_unreachable("Operator can't be handled");
488 }
489
490 (*this)(*binary_expr.lhs);
491 (*this)(*binary_expr.rhs);
492 }
493
494 void operator()(const ast::ErrorExpr&) { /* nothing to be done */ }
495 void operator()(const ast::FnApplicationExpr &) { /* nothing to be done */ }
496 void operator()(const ast::UnaryExpr &) { /* nothing to be done */ }
497 void operator()(const ast::QueryExpr &) { /* nothing to be done */ }
498};
499
501struct JoinTranslator : ast::ConstASTExprVisitor {
502
503 std::vector<std::pair<ThreadSafePooledString, ThreadSafePooledString>> join_designator;
504
505 using ConstASTExprVisitor::operator();
506
507 void operator()(const ast::Designator &designator) {
508 join_designator.emplace_back(designator.table_name.text, designator.attr_name.text);
509 }
510
511 void operator()(const ast::BinaryExpr &binary_expr) {
512
513 if (binary_expr.op().type != m::TK_EQUAL) { M_unreachable("Operator can't be handled"); }
514
515 (*this)(*binary_expr.lhs);
516 (*this)(*binary_expr.rhs);
517 }
518
519 void operator()(const ast::Constant &) { /* nothing to be done */ }
520 void operator()(const ast::ErrorExpr&) { /* nothing to be done */ }
521 void operator()(const ast::FnApplicationExpr &) { /* nothing to be done */ }
522 void operator()(const ast::UnaryExpr &) { /* nothing to be done */ }
523 void operator()(const ast::QueryExpr &) { /* nothing to be done */ }
524};
525
526}
527
529{
530 for (auto &e : table_to_spn_)
531 delete e.second;
532}
533
535
537{
538 table_to_spn_.emplace(
539 name_of_table,
541 );
542}
543
544std::pair<unsigned, bool> SpnEstimator::find_spn_id(const SpnDataModel &data, SpnJoin &join)
545{
546 /* we only have a single spn */
547 ThreadSafePooledString table_name = data.spns_.begin()->first;
548 auto &attr_to_id = data.spns_.begin()->second.get().get_attribute_to_id();
549
550 unsigned spn_id = 0;
551 bool is_primary_key = false;
552
553 /* check which identifier of the join corresponds to the table of the spn, get the corresponding attribute id */
554 auto find_iter = table_name == join.first.first ?
555 attr_to_id.find(join.first.second) : attr_to_id.find(join.second.second);
556
557 if (find_iter != attr_to_id.end()) { spn_id = find_iter->second; }
558 else { is_primary_key = true; }
559
560 return {spn_id, is_primary_key};
561}
562
563std::size_t SpnEstimator::max_frequency(const SpnDataModel &data, SpnJoin &join)
564{
565 auto [spn_id, is_primary_key] = find_spn_id(data, join);
566
567 /* maximum frequency is only computed on data models which only have one Spn */
568 const SpnWrapper &spn = data.spns_.begin()->second.get();
569
570 return is_primary_key ? 1 : data.num_rows_ / spn.estimate_number_distinct_values(spn_id);
571}
572
573std::size_t SpnEstimator::max_frequency(const SpnDataModel &data, const ThreadSafePooledString &attribute)
574{
575 /* maximum frequency is only computed on data models which only have one Spn */
576 const SpnWrapper &spn = data.spns_.begin()->second.get();
577 auto &attr_to_id = spn.get_attribute_to_id();
578 auto find_iter = attr_to_id.find(attribute);
579
580 return find_iter == attr_to_id.end() ? 1 : data.num_rows_ / spn.estimate_number_distinct_values(find_iter->second);
581}
582
583/*----- Model calculation --------------------------------------------------------------------------------------------*/
584
585std::unique_ptr<DataModel> SpnEstimator::empty_model() const
586{
587 return std::make_unique<SpnDataModel>(table_spn_map(), 0);
588}
589
590std::unique_ptr<DataModel> SpnEstimator::estimate_scan(const QueryGraph &G, Subproblem P) const
591{
592 M_insist(P.size() == 1);
593 const auto idx = *P.begin();
594 auto &BT = as<const BaseTable>(*G.sources()[idx]);
595 /* get the Spn corresponding for the table to scan */
596 if (auto it = table_to_spn_.find(BT.name().assert_not_none()); it != table_to_spn_.end()) {
597 table_spn_map spns;
598 const SpnWrapper &spn = *it->second;
599 spns.emplace(BT.name(), spn);
600 return std::make_unique<SpnDataModel>(std::move(spns), spn.num_rows());
601 } else {
602 throw data_model_exception("Table does not exist.");
603 }
604}
605
606std::unique_ptr<DataModel>
607SpnEstimator::estimate_filter(const QueryGraph&, const DataModel &_data, const cnf::CNF &filter) const
608{
609 auto &data = as<const SpnDataModel>(_data);
610 M_insist(data.spns_.size() == 1);
611 auto new_data = std::make_unique<SpnDataModel>(data);
612 auto &spn = new_data->spns_.begin()->second.get();
613 auto &attribute_to_id = spn.get_attribute_to_id();
614
615 Spn::Filter translated_filter;
616 FilterTranslator ft;
617
618 /* only consider clauses with one element, since Spns cannot estimate disjunctions */
619 for (auto &clause : filter) {
620 M_insist(clause.size() == 1);
621 ft(*clause[0]);
622 unsigned spn_id;
623
624 if (auto it = attribute_to_id.find(ft.attribute); it != attribute_to_id.end()) {
625 spn_id = it->second;
626 } else {
627 throw data_model_exception("Attribute does not exist.");
628 }
629
630 translated_filter.emplace(spn_id, std::make_pair(ft.op, ft.value));
631 }
632
633 /* Save number of rows in the newly constructed data model with the filter applied */
634 new_data->num_rows_ = float(new_data->num_rows_) * spn.likelihood(translated_filter);
635 return new_data;
636}
637
638std::unique_ptr<DataModel>
639SpnEstimator::estimate_limit(const QueryGraph&, const DataModel &data, std::size_t limit, std::size_t) const
640{
641 auto model = std::make_unique<SpnDataModel>(as<const SpnDataModel>(data));
642 model->num_rows_ = std::min(model->num_rows_, limit);
643 return model;
644}
645
646std::unique_ptr<DataModel> SpnEstimator::estimate_grouping(const QueryGraph&, const DataModel &data,
647 const std::vector<group_type> &groups) const
648{
649 auto model = std::make_unique<SpnDataModel>(as<const SpnDataModel>(data));
650 std::size_t num_rows = 1;
651 for (auto [grp, alias] : groups) {
652 auto designator = cast<const ast::Designator>(&grp.get());
653 if (not designator)
654 throw data_model_exception("SpnEstimator only supports Designators and no composed expressions");
655 auto spn_it = model->spns_.find(designator->table_name.text.assert_not_none());
656 if (spn_it == model->spns_.end())
657 throw data_model_exception("Could not find table for grouping.");
658
659 auto &spn = spn_it->second.get();
660 auto &attr_to_id = spn.get_attribute_to_id();
661 if (auto attr_it = attr_to_id.find(designator->attr_name.text.assert_not_none()); attr_it != attr_to_id.end()) {
662 num_rows *= spn.estimate_number_distinct_values(attr_it->second);
663 } else {
664 num_rows *= spn.num_rows(); // if attribute is primary key, distinct values = num rows
665 }
666 }
667 model->num_rows_ = num_rows;
668 return model;
669}
670
671std::unique_ptr<DataModel>
672SpnEstimator::estimate_join(const QueryGraph&, const DataModel &_left, const DataModel &_right,
673 const cnf::CNF &condition) const
674{
675 auto &left = as<const SpnDataModel>(_left);
676 auto &right = as<const SpnDataModel>(_right);
677
678 auto new_left = std::make_unique<SpnDataModel>(left);
679 auto new_right = std::make_unique<SpnDataModel>(right);
680
681 JoinTranslator jt;
682
683 if (not condition.empty()) {
684 /* only consider single equi join */
685 jt(*condition[0][0]);
686 auto first_identifier = std::make_pair(jt.join_designator[0].first, jt.join_designator[0].second);
687 auto second_identifier = std::make_pair(jt.join_designator[1].first, jt.join_designator[1].second);
688 SpnJoin join = std::make_pair(first_identifier, second_identifier);
689
690 /* if a data model is only responsible for one table (one spn) add the maximum frequency of the value
691 * of the joined attribute */
692 if (left.spns_.size() == 1) { new_left->max_frequencies_.emplace_back(max_frequency(left, join)); }
693 if (right.spns_.size() == 1) { new_right->max_frequencies_.emplace_back(max_frequency(right, join)); }
694
695 /* compute the estimated cardinality of the join via distinct count estimates.
696 * See http://www.cidrdb.org/cidr2021/papers/cidr2021_paper01.pdf */
697 std::size_t mf_left = std::accumulate(
698 new_left->max_frequencies_.begin(), new_left->max_frequencies_.end(), 1, std::multiplies<>());
699
700 std::size_t mf_right = std::accumulate(
701 new_right->max_frequencies_.begin(), new_right->max_frequencies_.end(), 1, std::multiplies<>());
702
703 std::size_t left_clause = new_left->num_rows_ / mf_left;
704 std::size_t right_clause = new_right->num_rows_ / mf_right;
705
706 std::size_t num_rows_join = std::min<std::size_t>(left_clause, right_clause) * mf_left * mf_right;
707
708 new_left->num_rows_ = num_rows_join;
709 } else {
710 /* compute cartesian product since there is no join condition */
711 if (left.spns_.size() == 1) { new_left->max_frequencies_.emplace_back(left.num_rows_); }
712 if (right.spns_.size() == 1) { new_right->max_frequencies_.emplace_back(right.num_rows_); }
713
714 new_left->num_rows_ = left.num_rows_ * right.num_rows_;
715 }
716
717 /* copy data from new_right to new_left to collect all information in one DataModel */
718 new_left->spns_.insert(new_right->spns_.begin(), new_right->spns_.end());
719 new_left->max_frequencies_.insert(
720 new_left->max_frequencies_.end(), new_right->max_frequencies_.begin(), new_right->max_frequencies_.end());
721
722 return new_left;
723}
724
725template<typename PlanTable>
726std::unique_ptr<DataModel>
728 const cnf::CNF &condition) const
729{
730 M_insist(not to_join.empty());
731 /* compute cartesian product */
732 if (condition.empty()) {
733 auto model = std::make_unique<SpnDataModel>();
734 model->num_rows_ = 1UL;
735 for (auto it = to_join.begin(); it != to_join.end(); ++it)
736 model->num_rows_ *= as<const SpnDataModel>(*PT[it.as_set()].model).num_rows_;
737 return model;
738 }
739
740 /* get all attributes to join on */
741 JoinTranslator jt;
742 std::unordered_map<ThreadSafePooledString, ThreadSafePooledString> table_to_attribute;
743 for (auto clause : condition) {
744 jt(*clause[0]);
745 table_to_attribute.emplace(jt.join_designator[0].first, jt.join_designator[0].second);
746 table_to_attribute.emplace(jt.join_designator[1].first, jt.join_designator[1].second);
747 }
748
749 /* get first model to join */
750 SpnDataModel final_model = as<const SpnDataModel>(*PT[to_join.begin().as_set()].model);
751 ThreadSafePooledString first_table_name = final_model.spns_.begin()->first;
752
753 /* if there is a join condition on this model, get the maximum frequency of the attribute */
754 if (auto find_iter = table_to_attribute.find(first_table_name); find_iter != table_to_attribute.end()) {
755 final_model.max_frequencies_.emplace_back(max_frequency(final_model, find_iter->second));
756 }
757 /* else, maximum frequency is set as the number of rows (to compute the cartesian product) */
758 else {
759 final_model.max_frequencies_.emplace_back(final_model.spns_.begin()->second.get().num_rows());
760 }
761
762 /* iteratively add the next model to join via distinct count estimates */
763 for (auto it = ++to_join.begin(); it != to_join.end(); it++) {
764 SpnDataModel model = as<const SpnDataModel>(*PT[it.as_set()].model);
765 ThreadSafePooledString table_name = model.spns_.begin()->first;
766
767 if (auto find_iter = table_to_attribute.find(table_name); find_iter != table_to_attribute.end()) {
768 model.max_frequencies_.emplace_back(max_frequency(model, find_iter->second));
769 } else {
770 model.max_frequencies_.emplace_back(model.spns_.begin()->second.get().num_rows());
771 }
772
773 std::size_t mf_left = std::accumulate(
774 final_model.max_frequencies_.begin(), final_model.max_frequencies_.end(), 1, std::multiplies<>());
775 std::size_t mf_right = model.max_frequencies_[0];
776
777 std::size_t left_clause = final_model.num_rows_ / mf_left;
778 std::size_t right_clause = model.num_rows_ / mf_right;
779
780 std::size_t num_rows_join = std::min<std::size_t>(left_clause, right_clause) * mf_left * mf_right;
781
782 final_model.num_rows_ = num_rows_join;
783
784 /* copy data from model to final_model to collect all information in one DataModel */
785 final_model.spns_.emplace(*model.spns_.begin());
786 final_model.max_frequencies_.emplace_back(model.max_frequencies_[0]);
787 }
788
789 return std::make_unique<SpnDataModel>(final_model);
790}
791
792std::size_t SpnEstimator::predict_cardinality(const DataModel &_data) const
793{
794 auto &data = as<const SpnDataModel>(_data);
795 return data.num_rows_;
796}
797
798void SpnEstimator::print(std::ostream&) const { }
799
800
801#define LIST_CE(X) \
802 X(CartesianProductEstimator, "CartesianProduct", "estimates cardinalities as Cartesian product") \
803 X(InjectionCardinalityEstimator, "Injected", "estimates cardinalities based on a JSON file") \
804 X(SpnEstimator, "Spn", "estimates cardinalities based on Sum-Product Networks")
805
806#define INSTANTIATE(TYPE, _1, _2) \
807 template std::unique_ptr<DataModel> TYPE::operator()(estimate_join_all_tag, PlanTableSmallOrDense &&PT, \
808 const QueryGraph &G, Subproblem to_join, \
809 const cnf::CNF &condition) const; \
810 template std::unique_ptr<DataModel> TYPE::operator()(estimate_join_all_tag, PlanTableLargeAndSparse &&PT, \
811 const QueryGraph &G, Subproblem to_join, \
812 const cnf::CNF &condition) const;
814#undef INSTANTIATE
815
816__attribute__((constructor(202)))
817static void register_cardinality_estimators()
818{
819 Catalog &C = Catalog::Get();
820
821#define REGISTER(TYPE, NAME, DESCRIPTION) \
822 C.register_cardinality_estimator<TYPE>(C.pool(NAME), DESCRIPTION);
824#undef REGISTER
825
826 C.arg_parser().add<bool>(
827 /* group= */ "Cardinality estimation",
828 /* short= */ nullptr,
829 /* long= */ "--show-cardinality-file-example",
830 /* description= */ "show an example of an input JSON file for cardinality injection",
831 [] (bool) {
832 std::cout << "\
833Example for injected cardinalities file:\n\
834{\n\
835 database1: [\n\
836 {\n\
837 \"relations\": [\"A\", \"B\", ...],\n\
838 \"size\": 150\n\
839 },\n\
840 {\n\
841 \"relations\": [\"C\", \"A\", ...],\n\
842 \"size\": 100\n\
843 },\n\
844 },\n\
845 database2: [\n\
846 {\n\
847 \"relations\": [\"customers\"],\n\
848 \"size\": 1000\n\
849 },\n\
850 {\n\
851 \"relations\": [\"customers\", \"orders\", ...],\n\
852 \"size\": 50\n\
853 },\n\
854 },\n\
855}\n";
856 exit(EXIT_SUCCESS);
857 }
858 );
859 C.arg_parser().add<const char*>(
860 /* group= */ "Cardinality estimation",
861 /* short= */ nullptr,
862 /* long= */ "--use-cardinality-file",
863 /* description= */ "inject cardinalities from the given JSON file",
864 [] (const char *path) {
865 options::injected_cardinalities_file = path;
866 }
867 );
868}
#define LIST_CE(X)
#define REGISTER(CLASS)
__attribute__((constructor(202))) static void register_interpreter()
#define INSTANTIATE(CLASS)
std::size_t max_cardinality
‍maximum cardinality of relations and intermediate results
void add(const char *group_name, const char *short_name, const char *long_name, const char *description, Callback &&callback)
Adds a new group option to the ArgParser.
Definition: ArgParser.hpp:84
#define M_unreachable(MSG)
Definition: macro.hpp:146
#define M_insist(...)
Definition: macro.hpp:129
Bool< L > value
Definition: WasmUtil.hpp:1317
auto op
Definition: WasmDSL.hpp:2384
‍mutable namespace
Definition: Backend.hpp:10
ThreadSafeStringPool::proxy_type ThreadSafePooledString
Definition: Pool.hpp:464
auto visit(Callable &&callable, Base &obj, m::tag< Callable > &&=m::tag< Callable >())
Generic implementation to visit a class hierarchy, with similar syntax as std::visit.
Definition: Visitor.hpp:138
‍command-line options for the HeuristicSearchPlanEnumerator
Definition: V8Engine.cpp:44
data_model_exception is thrown if a DataModel implementation does not contain the requested informati...
virtual double predict_number_distinct_values(const DataModel &data) const
virtual void print(std::ostream &out) const =0
std::unique_ptr< DataModel > estimate_join(const QueryGraph &G, const DataModel &left, const DataModel &right, const cnf::CNF &condition) const override
std::unique_ptr< DataModel > estimate_grouping(const QueryGraph &G, const DataModel &data, const std::vector< group_type > &groups) const override
void print(std::ostream &out) const override
std::unique_ptr< DataModel > estimate_limit(const QueryGraph &G, const DataModel &data, std::size_t limit, std::size_t offset) const override
std::unique_ptr< DataModel > empty_model() const override
std::unique_ptr< DataModel > estimate_filter(const QueryGraph &G, const DataModel &data, const cnf::CNF &filter) const override
std::unique_ptr< DataModel > operator()(estimate_join_all_tag, PlanTable &&PT, const QueryGraph &G, Subproblem to_join, const cnf::CNF &condition) const
std::unique_ptr< DataModel > estimate_scan(const QueryGraph &G, Subproblem P) const override
std::size_t predict_cardinality(const DataModel &data) const override
The catalog contains all Databases and keeps track of all meta information of the database system.
Definition: Catalog.hpp:215
ThreadSafePooledString pool(const char *str) const
Creates an internalized copy of the string str by adding it to the internal StringPool.
Definition: Catalog.hpp:274
static Catalog & Get()
Return a reference to the single Catalog instance.
m::ArgParser & arg_parser()
Definition: Catalog.hpp:253
A DataModel describes a data set.
virtual ~DataModel()=0
std::ostream & w(const Position pos)
Definition: Diagnostic.hpp:36
std::unordered_map< ThreadSafePooledString, std::size_t > cardinality_table_
void print(std::ostream &out) const override
std::vector< char > buf_
‍buffer used to construct identifiers
std::unique_ptr< DataModel > estimate_grouping(const QueryGraph &G, const DataModel &data, const std::vector< group_type > &groups) const override
std::size_t predict_cardinality(const DataModel &data) const override
std::unique_ptr< DataModel > estimate_scan(const QueryGraph &G, Subproblem P) const override
std::unique_ptr< DataModel > estimate_join(const QueryGraph &G, const DataModel &left, const DataModel &right, const cnf::CNF &condition) const override
std::unique_ptr< DataModel > operator()(estimate_join_all_tag, PlanTable &&PT, const QueryGraph &G, Subproblem to_join, const cnf::CNF &condition) const
ThreadSafePooledString make_identifier(const QueryGraph &G, const Subproblem S) const
std::unique_ptr< DataModel > empty_model() const override
InjectionCardinalityEstimator(ThreadSafePooledString name_of_database)
Create an InjectionCardinalityEstimator for the database name_of_database from file that was passed b...
std::unique_ptr< DataModel > estimate_limit(const QueryGraph &G, const DataModel &data, std::size_t limit, std::size_t offset) const override
std::ostringstream oss_
‍buffer used to construct identifiers
void buf_append(const char *s) const
std::unique_ptr< DataModel > estimate_filter(const QueryGraph &G, const DataModel &data, const cnf::CNF &filter) const override
void read_json(Diagnostic &diag, std::istream &in, const ThreadSafePooledString &name_of_database)
A Type that represents the absence of any other type.
Definition: Type.hpp:204
The numeric type represents integer and floating-point types of different precision and scale.
Definition: Type.hpp:393
static Options & Get()
Return a reference to the single Options instance.
Definition: Options.cpp:9
This table represents all explored plans with their sub-plans, estimated size, cost,...
Definition: PlanTable.hpp:284
This table represents all explored plans with their sub-plans, estimated size, cost,...
Definition: PlanTable.hpp:180
A data type representing a pooled (or internalized) object.
Definition: Pool.hpp:168
Pooled< T, Pool, false > assert_not_none() const
Definition: Pool.hpp:239
The query graph represents all data sources and joins in a graph structure.
Definition: QueryGraph.hpp:172
const auto & sources() const
Definition: QueryGraph.hpp:257
Implements a small and efficient set over integers in the range of 0 to 63 (including).
Definition: ADT.hpp:26
std::size_t size() const
Returns the number of elements in this SmallBitset.
Definition: ADT.hpp:161
bool empty() const
Returns true if there are no elements in this SmallBitset.
Definition: ADT.hpp:163
auto end() const
Definition: ADT.hpp:175
auto begin() const
Definition: ADT.hpp:173
table_spn_map spns_
a map from table to Spn
std::vector< std::size_t > max_frequencies_
the maximum frequencies of values of attributes to join
std::unique_ptr< DataModel > operator()(estimate_join_all_tag, PlanTable &&PT, const QueryGraph &G, Subproblem to_join, const cnf::CNF &condition) const
std::unique_ptr< DataModel > estimate_join(const QueryGraph &G, const DataModel &left, const DataModel &right, const cnf::CNF &condition) const override
static std::size_t max_frequency(const SpnDataModel &data, SpnJoin &join)
Compute the maximum frequency of values of the attribute in the join.
void learn_spns()
Learn an Spn on every table in the database.
std::unordered_map< ThreadSafePooledString, SpnWrapper * > table_to_spn_
‍the map from every table to its respective Spn, initially empty
static std::pair< unsigned, bool > find_spn_id(const SpnDataModel &data, SpnJoin &join)
Function to compute which of the two join identifiers belongs to the given data model and which attri...
std::unique_ptr< DataModel > estimate_limit(const QueryGraph &G, const DataModel &data, std::size_t limit, std::size_t offset) const override
ThreadSafePooledString name_of_database_
‍the name of the database, the estimator is built on
std::unique_ptr< DataModel > estimate_scan(const QueryGraph &G, Subproblem P) const override
std::unique_ptr< DataModel > empty_model() const override
std::pair< SpnIdentifier, SpnIdentifier > SpnJoin
void print(std::ostream &out) const override
std::unordered_map< ThreadSafePooledString, std::reference_wrapper< const SpnWrapper > > table_spn_map
void learn_new_spn(const ThreadSafePooledString &name_of_table)
Add a new Spn for a table in the database.
std::unique_ptr< DataModel > estimate_grouping(const QueryGraph &G, const DataModel &data, const std::vector< group_type > &groups) const override
std::unique_ptr< DataModel > estimate_filter(const QueryGraph &G, const DataModel &data, const cnf::CNF &filter) const override
std::size_t predict_cardinality(const DataModel &data) const override
A wrapper class for an Spn to be used in the context of databases.
Definition: SpnWrapper.hpp:13
std::size_t estimate_number_distinct_values(const ThreadSafePooledString &attribute) const
Estimate the number of distinct values of the given attribute.
Definition: SpnWrapper.hpp:110
std::size_t num_rows() const
returns the number of rows in the SPN.
Definition: SpnWrapper.hpp:72
static std::unordered_map< ThreadSafePooledString, SpnWrapper * > learn_spn_database(const ThreadSafePooledString &name_of_database, std::unordered_map< ThreadSafePooledString, std::vector< Spn::LeafType > > leaf_types=decltype(leaf_types)())
Learn SPNs over the tables in the given database.
Definition: SpnWrapper.cpp:133
static SpnWrapper learn_spn_table(const ThreadSafePooledString &name_of_database, const ThreadSafePooledString &name_of_table, std::vector< Spn::LeafType > leaf_types=decltype(leaf_types)())
Learn an SPN over the given table.
Definition: SpnWrapper.cpp:11
const std::unordered_map< ThreadSafePooledString, unsigned > & get_attribute_to_id() const
Get the reference to the attribute to spn internal id mapping.
Definition: SpnWrapper.hpp:45
Tree structure for Sum Product Networks.
Definition: Spn.hpp:20
SpnOperator
Definition: Spn.hpp:28
@ GREATER
Definition: Spn.hpp:32
@ EQUAL
Definition: Spn.hpp:29
@ LESS
Definition: Spn.hpp:30
@ GREATER_EQUAL
Definition: Spn.hpp:33
@ LESS_EQUAL
Definition: Spn.hpp:31
@ IS_NULL
Definition: Spn.hpp:34
std::unordered_map< unsigned, std::pair< SpnOperator, float > > Filter
Definition: Spn.hpp:47
A binary expression.
Definition: AST.hpp:348
std::unique_ptr< Expr > lhs
Definition: AST.hpp:349
std::unique_ptr< Expr > rhs
Definition: AST.hpp:350
Token op() const
Definition: AST.hpp:377
A constant: a string literal or a numeric constant.
Definition: AST.hpp:213
A designator.
Definition: AST.hpp:134
Token table_name
Definition: AST.hpp:138
Token attr_name
Definition: AST.hpp:139
The error expression.
Definition: AST.hpp:116
const Type * type() const
Returns the Type of this Expr.
Definition: AST.hpp:58
A function application.
Definition: AST.hpp:246
A query expression for nested queries.
Definition: AST.hpp:389
TokenType type
Definition: Token.hpp:17
ThreadSafePooledOptionalString text
declared as optional for dummy tokens
Definition: Token.hpp:16
A unary expression: "+e", "-e", "~e", "NOT e".
Definition: AST.hpp:324
A CNF represents a conjunction of cnf::Clauses.
Definition: CNF.hpp:134
Signals that an index-based or key-based access was out of range.
Definition: exception.hpp:43