mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
CostModel.cpp
Go to the documentation of this file.
2
5#include "util/GridSearch.hpp"
6#include "util/stream.hpp"
8#include <mutable/mutable.hpp>
9#include <type_traits>
10
11
12using namespace m;
13using namespace m::storage;
14
15
16static constexpr std::size_t NUM_DISTINCT_VALUES_IN_FILTER_EXPERIMENT = 100;
17static constexpr unsigned DEFAULT_FILTER_POLYNOMIAL_DEGREE = 9;
19constexpr unsigned NUM_REPETITIONS = 5;
20
21
22//======================================================================================================================
23// Helper Functions
24//======================================================================================================================
25
28uint64_t get_column_offset_in_bytes(const DataLayout &layout, std::size_t idx)
29{
30 auto &child = as<const DataLayout::INode>(layout.child()).at(idx);
31 M_insist(as<DataLayout::Leaf>(child.ptr.get())->index() == idx, "index of entry must match index in leaf");
32 M_insist(child.offset_in_bits % 8 == 0, "column must be byte-aligned");
33 return child.offset_in_bits / 8;
34}
35
37void save_csv(const std::string &csv_path, const Eigen::MatrixXd &matrix, const std::string &header = "")
38{
39 std::ofstream csv_file(csv_path);
40 if (!csv_file) {
41 std::cerr << "Filepath \"" << csv_path << "\" is invalid.";
42 exit(EXIT_FAILURE);
43 }
44
45 if (not header.empty()) {
46 csv_file << header << '\n';
47 }
48
49 Eigen::IOFormat csvFmt(Eigen::StreamPrecision, Eigen::DontAlignCols,
50 ",", "\n");
51 csv_file << matrix.format(csvFmt) << std::endl;
52}
53
54
55//======================================================================================================================
56// Query Function
57//======================================================================================================================
58
61{
62 using namespace std::chrono;
63
64 NullStream devnull;
65 Diagnostic diag(false, std::cout, std::cerr);
66 auto &C = Catalog::Get();
67 C.set_database_in_use(DB);
68
69 auto stmt = statement_from_string(diag, input);
70 auto query = as<ast::SelectStmt>(stmt.release());
71
72 using duration_t = std::remove_reference_t<decltype(C.timer())>::duration;
73 std::vector<duration_t> execution_times;
74 for (unsigned i = 0; i != NUM_REPETITIONS; ++i) {
75 auto consumer = std::make_unique<NoOpOperator>(devnull);
76 execute_query(diag, *query, std::move(consumer));
77 auto measurement = C.timer().get("Execute the query");
78 execution_times.push_back(measurement.duration());
79 }
80
81 std::sort(execution_times.begin(), execution_times.end());
82 const std::size_t n = execution_times.size();
83 const Timer::duration median = (execution_times[(n - 1) / 2] + execution_times[n / 2]) / 2; // median
84 return median;
85}
86
87//======================================================================================================================
88// Suite Functions
89//======================================================================================================================
90
92template<typename T>
93std::pair<Eigen::MatrixXd, Eigen::VectorXd> generate_training_suite_filter()
94{
95 Catalog &C = Catalog::Get();
96
97 /* Consider cardinalities from 0 to 1e7. */
98 gs::LinearSpace<unsigned> space_cardinality(0, 1e7, 4);
99 /* Define grid search. */
101 space_cardinality,
102 /* Consider selectivities from 0 to 1 (0% to 100%). */
104 );
105
106 /* Allocate feature matrix and target vector. */
107 Eigen::MatrixXd feature_matrix(GS.num_points(), 3); // #columns = #features + 1
108 Eigen::VectorXd target_vector(GS.num_points());
109
110 /*----- Set up database. -----------------------------------------------------------------------------------------*/
111 /* Create database. */
112 Database &DB = C.add_database(C.pool("$db_train_models"));
113 /* Create table. */
114 auto &table = DB.add_table(C.pool("filter"));
115 table.push_back(C.pool("id"), Type::Get_Integer(Type::TY_Vector, 4));
116 table.push_back(C.pool("val"), get_runtime_type<T>());
117 /* Set table store and data layout. */
118 table.store(C.create_store(C.pool("PaxStore"), table));
119 PAXLayoutFactory factory(PAXLayoutFactory::NTuples, space_cardinality.hi());
120 table.layout(factory); // consider maximal cardinality to reuse data layout
121 uint8_t *mem_ptr = reinterpret_cast<uint8_t*>(table.store().memory().addr());
122 uint8_t *null_bitmap_column = mem_ptr + get_column_offset_in_bytes(table.layout(), table.num_attrs());
123 void *id_column = reinterpret_cast<void*>(mem_ptr + get_column_offset_in_bytes(table.layout(), 0));
124 T *val_column = reinterpret_cast<T*>(mem_ptr + get_column_offset_in_bytes(table.layout(), 1));
125
126 /*----- Prepare data. --------------------------------------------------------------------------------------------*/
127 gs::LinearSpace<T> value_space = []() -> gs::LinearSpace<T> {
128 if constexpr (std::is_integral_v<T>) {
129 if constexpr (std::is_signed_v<T>) {
131 } else {
133 }
134 } else {
135 if constexpr (std::is_signed_v<T>) {
137 } else {
139 }
140 }
141 }();
142 const std::vector<T> values = value_space.sequence();
144
145 /*----- Perform grid search. -------------------------------------------------------------------------------------*/
146 std::size_t row_index = 0;
147 std::ostringstream oss;
148 unsigned old_cardinality = 0;
149 auto scan_time = time_select_query_execution(DB, "SELECT val FROM filter;");
150 GS([&](unsigned cardinality, double selectivity) {
151 if (cardinality != old_cardinality) {
152 const std::size_t delta_cardinality = cardinality - old_cardinality;
153
154 /* Fill store with new data. */
155 for (unsigned i = 0; i != delta_cardinality; ++i) table.store().append(); // allocate fresh rows in store
156 M_insist(table.store().num_rows() == cardinality);
157 set_all_not_null(null_bitmap_column, table.num_attrs(), old_cardinality, cardinality);
158 generate_primary_keys(id_column, *table[0UL].type, old_cardinality, cardinality);
159 fill_uniform(val_column, values, old_cardinality, cardinality);
160 M_insist(table.store().num_rows() == cardinality);
161 old_cardinality = cardinality;
162
163 /* Measure time to scan table. */
164 scan_time = time_select_query_execution(DB, "SELECT val FROM filter;");
165 }
166
167 /* Evaluate selection. */
168 oss.str("");
169 oss << "SELECT 1 FROM filter WHERE val < " << T(value_space.lo() + selectivity * value_space.delta()) << ';';
170 const Timer::duration query_time = time_select_query_execution(DB, oss.str());
171
172 /* Insert training data into feature matrix and target vector. */
173 using namespace std::chrono;
174 const double time_in_millisecs = duration_cast<microseconds>(query_time - scan_time).count() / 1e3;
175 feature_matrix(row_index, 0) = 1; // add 1 for y-intercept
176 feature_matrix(row_index, 1) = cardinality;
177 feature_matrix(row_index, 2) = selectivity;
178 target_vector(row_index) = time_in_millisecs;
179 ++row_index;
180 });
181
183 C.drop_database(DB);
184
185 return std::make_pair(feature_matrix, target_vector);
186}
187
189template<typename T>
190std::pair<Eigen::MatrixXd, Eigen::VectorXd> generate_training_suite_group_by()
191{
192 Catalog &C = Catalog::Get();
193
194 /* Consider the number of distinct values from 1 to 1e4. */
195 gs::LinearSpace<unsigned> space_of_distinct_values(1, 1e4, 10);
196 /* Consider cardinalities from 0 to 1e7. */
197 gs::LinearSpace<unsigned> space_cardinality(0, 1e7, 10);
198 /* Define grid search. */
200 space_of_distinct_values,
201 space_cardinality
202 );
203
204 /* Allocate feature matrix and target vector. */
205 Eigen::MatrixXd feature_matrix(GS.num_points(), 3); // #columns = #features + 1
206 Eigen::VectorXd target_vector(GS.num_points());
207
208
209 /*----- Set up database. -----------------------------------------------------------------------------------------*/
210 /* Create database. */
211 Database &DB = C.add_database(C.pool("$db_train_models"));
212 /* Create table. */
213 auto &table = DB.add_table(C.pool("group_by"));
214 table.push_back(C.pool("id"), Type::Get_Integer(Type::TY_Vector, 4));
215 table.push_back(C.pool("val"), get_runtime_type<T>());
216 /* Set table store and data layout. */
217 table.store(C.create_store(C.pool("PaxStore"), table));
218 PAXLayoutFactory factory(PAXLayoutFactory::NTuples, space_cardinality.hi());
219 table.layout(factory); // consider maximal cardinality to reuse data layout
220 uint8_t *mem_ptr = reinterpret_cast<uint8_t*>(table.store().memory().addr());
221 uint8_t *null_bitmap_column = mem_ptr + get_column_offset_in_bytes(table.layout(), table.num_attrs());
222 void *id_column = reinterpret_cast<void*>(mem_ptr + get_column_offset_in_bytes(table.layout(), 0));
223 T *val_column = reinterpret_cast<T*>(mem_ptr + get_column_offset_in_bytes(table.layout(), 1));
224
225 std::vector<T> distinct_values;
226 std::size_t row_index = 0;
227 std::ostringstream oss;
228 unsigned old_cardinality = table.store().num_rows();
229 unsigned old_num_distinct_values = 0;
230 auto scan_time = time_select_query_execution(DB, "SELECT val FROM group_by;");
231 GS([&](unsigned num_distinct_values, unsigned cardinality) {
232 M_insist(table.store().num_rows() == old_cardinality);
233
234 if (old_num_distinct_values != num_distinct_values) {
235 if (old_cardinality > cardinality) {
236 /* Shrink store. */
237 for (unsigned i = old_cardinality; i != cardinality; --i) table.store().drop();
238 } else if (old_cardinality < cardinality) {
239 /* Grow store. */
240 for (unsigned i = old_cardinality; i != cardinality; ++i) table.store().append();
241 M_insist(table.store().num_rows() == cardinality);
242 set_all_not_null(null_bitmap_column, table.num_attrs(), cardinality, old_cardinality);
243 generate_primary_keys(id_column, *table[0UL].type, old_cardinality, cardinality);
244 }
245 M_insist(table.store().num_rows() == cardinality);
246
247 /*----- Generate distinct values. ------------------------------------------------------------------------*/
248 if (num_distinct_values == 1) {
249 distinct_values = { T(0) };
250 } else {
251 gs::LinearSpace<T> value_space = [num_distinct_values]() -> gs::LinearSpace<T> {
252 if constexpr (std::is_integral_v<T>) {
253 if constexpr (std::is_signed_v<T>) {
254 return gs::LinearSpace<T>(-5000, 5000, num_distinct_values - 1);
255 } else {
256 return gs::LinearSpace<T>(0, 10000, num_distinct_values - 1);
257 }
258 } else {
259 if constexpr (std::is_signed_v<T>) {
260 return gs::LinearSpace<T>(-1, 1, num_distinct_values - 1);
261 } else {
262 return gs::LinearSpace<T>(0, 1, num_distinct_values - 1);
263 }
264 }
265 }();
266 distinct_values = value_space.sequence();
267 }
268 M_insist(distinct_values.size() == num_distinct_values);
269
270 /* Completely fill the entire column with the new distinct values. */
271 fill_uniform(val_column, distinct_values, 0, cardinality);
272
273 old_cardinality = cardinality;
274 old_num_distinct_values = num_distinct_values;
275
276 /* Measure time to scan table. */
277 scan_time = time_select_query_execution(DB, "SELECT val FROM group_by;");
278 } else if (old_cardinality < cardinality) {
279 /* Grow store. */
280 for (unsigned i = old_cardinality; i != cardinality; ++i) table.store().append();
281 M_insist(table.store().num_rows() == cardinality);
282 set_all_not_null(null_bitmap_column, table.num_attrs(), old_cardinality, cardinality);
283 generate_primary_keys(id_column, *table[0UL].type, old_cardinality, cardinality);
284 fill_uniform(val_column, distinct_values, old_cardinality, cardinality);
285
286 old_cardinality = cardinality;
287
288 /* Measure time to scan table. */
289 scan_time = time_select_query_execution(DB, "SELECT val FROM group_by;");
290 } else if (old_cardinality > cardinality) {
291 /* Shrink store. */
292 for (unsigned i = old_cardinality; i != cardinality; --i) table.store().drop();
293 M_insist(table.store().num_rows() == cardinality);
294
295 /* Measure time to scan table. */
296 scan_time = time_select_query_execution(DB, "SELECT val FROM group_by;");
297 }
298
299 /* Evaluate grouping. */
300 /* FIXME: uses hack for pre-allocation. */
301 oss.str("");
302 // oss << "SELECT n" << std::to_string(num_distinct_values) << " FROM (SELECT id, val AS n" << num_distinct_values
303 // << " FROM group_by) AS GB GROUP BY n" << num_distinct_values << ";";
304 oss << "SELECT val FROM group_by GROUP BY val;";
305 const Timer::duration query_time = time_select_query_execution(DB, oss.str());
306
307 /* Insert training data into feature matrix and target vector. */
308 using namespace std::chrono;
309 const double time_in_millisecs = duration_cast<microseconds>(query_time - scan_time).count() / 1e3;
310 feature_matrix(row_index, 0) = 1; // add 1 for y-intercept
311 feature_matrix(row_index, 1) = cardinality;
312 feature_matrix(row_index, 2) = num_distinct_values;
313 target_vector(row_index) = time_in_millisecs;
314 ++row_index;
315 });
316
317 return std::make_pair(feature_matrix, target_vector);
318}
319
321template<typename T>
322std::pair<Eigen::MatrixXd, Eigen::VectorXd> generate_training_suite_join()
323{
324 Catalog &C = Catalog::Get();
325
326 /* Consider the result size from 0 to 1e7. */
327 auto space_result_size = gs::LinearSpace<unsigned>(0, 1e7, 1);
328 /* Consider the left redundancy from 1 to 100. */
329 auto space_redundancy_left = gs::LinearSpace<unsigned>(1, 100, 1);
330 /* Consider the right redundancy from 1 to 100. */
331 auto space_redundancy_right = gs::LinearSpace<unsigned>(1, 100, 1);
332 /* Consider the left cardinality from 1 to 1e7. */
333 auto space_cardinality_left = gs::LinearSpace<unsigned>(1, 1e7, 1);
334 /* Consider the right cardinality from 1 to 1e7. */
335 auto space_cardinality_right = gs::LinearSpace<unsigned>(1, 1e7, 1);
336 /* Define grid search. */
337 gs::GridSearch GS(space_result_size, space_redundancy_left, space_redundancy_right,
338 space_cardinality_left, space_cardinality_right);
339
340 /* Allocate feature matrix and target vector. */
341 Eigen::MatrixXd feature_matrix(GS.num_points(), 6); // #columns = #features + 1
342 Eigen::VectorXd target_vector(GS.num_points());
343
344
345 /*----- Set up database. -----------------------------------------------------------------------------------------*/
346 /* Create database. */
347 Database &DB = C.add_database(C.pool("$db_train_models"));
348 /* Create tables. */
349 auto &table_left = DB.add_table(C.pool("join_left"));
350 table_left.push_back(C.pool("id"), Type::Get_Integer(Type::TY_Vector, 4));
351 table_left.push_back(C.pool("val"), get_runtime_type<T>());
352 auto &table_right = DB.add_table(C.pool("join_right"));
353 table_right.push_back(C.pool("id"), Type::Get_Integer(Type::TY_Vector, 4));
354 table_right.push_back(C.pool("val"), get_runtime_type<T>());
355 /* Set table stores and data layouts. */
356 table_left.store(C.create_store(C.pool("PaxStore"), table_left));
357 table_right.store(C.create_store(C.pool("PaxStore"), table_right));
358 PAXLayoutFactory factory_left(PAXLayoutFactory::NTuples, space_cardinality_left.hi());
359 PAXLayoutFactory factory_right(PAXLayoutFactory::NTuples, space_cardinality_right.hi());
360 table_left.layout(factory_left); // consider maximal cardinality to reuse data layout
361 table_right.layout(factory_right); // consider maximal cardinality to reuse data layout
362 uint8_t *mem_ptr_left = reinterpret_cast<uint8_t*>(table_left.store().memory().addr());
363 uint8_t *mem_ptr_right = reinterpret_cast<uint8_t*>(table_right.store().memory().addr());
364 uint8_t *null_bitmap_column_left = mem_ptr_left + get_column_offset_in_bytes(table_left.layout(), table_left.num_attrs());
365 uint8_t *null_bitmap_column_right = mem_ptr_right + get_column_offset_in_bytes(table_right.layout(), table_right.num_attrs());
366 void *id_column_left = reinterpret_cast<void*>(mem_ptr_left + get_column_offset_in_bytes(table_left.layout(), 0));
367 void *id_column_right = reinterpret_cast<void*>(mem_ptr_right + get_column_offset_in_bytes(table_right.layout(), 0));
368 T *val_column_left = reinterpret_cast<T*>(mem_ptr_left + get_column_offset_in_bytes(table_left.layout(), 1));
369 T *val_column_right = reinterpret_cast<T*>(mem_ptr_right + get_column_offset_in_bytes(table_right.layout(), 1));
370
371 std::vector<T> distinct_values;
372 std::size_t row_index = 0;
373 unsigned old_cardinality_left = table_left.store().num_rows();
374 unsigned old_cardinality_right = table_right.store().num_rows();
375 GS([&](unsigned result_size, unsigned redundancy_left, unsigned redundancy_right,
376 unsigned cardinality_left, unsigned cardinality_right) {
377 M_insist(table_left.store().num_rows() == old_cardinality_left and
378 table_right.store().num_rows() == old_cardinality_right);
379
380 /* Check if current feature state is valid. */
381 const unsigned num_distinct_values_left = cardinality_left / redundancy_left;
382 const unsigned num_distinct_values_right = cardinality_right / redundancy_right;
383 const unsigned max_result_size = redundancy_right * redundancy_left
384 * std::min(num_distinct_values_left, num_distinct_values_right);
385 if (cardinality_left < redundancy_left or cardinality_right < redundancy_right or max_result_size < result_size)
386 return;
387
388 if (old_cardinality_left < cardinality_left) {
389 /* Grow store. */
390 for (unsigned i = old_cardinality_left; i != cardinality_left; ++i) table_left.store().append();
391 M_insist(table_left.store().num_rows() == cardinality_left);
392 set_all_not_null(null_bitmap_column_left, table_left.num_attrs(), old_cardinality_left, cardinality_left);
393 generate_primary_keys(id_column_left, *table_left[0UL].type, old_cardinality_left, cardinality_left);
394 } else if (old_cardinality_left > cardinality_left) {
395 /* Shrink store. */
396 for (unsigned i = old_cardinality_left; i != cardinality_left; --i) table_left.store().drop();
397 }
398 M_insist(table_left.store().num_rows() == cardinality_left);
399 if (old_cardinality_right < cardinality_right) {
400 /* Grow store. */
401 for (unsigned i = old_cardinality_right; i != cardinality_right; ++i) table_right.store().append();
402 M_insist(table_right.store().num_rows() == cardinality_right);
403 set_all_not_null(null_bitmap_column_right, table_right.num_attrs(), old_cardinality_right, cardinality_right);
404 generate_primary_keys(id_column_right, *table_right[0UL].type, old_cardinality_right, cardinality_right);
405 } else if (old_cardinality_right > cardinality_right) {
406 /* Shrink store. */
407 for (unsigned i = old_cardinality_right; i != cardinality_right; --i) table_right.store().drop();
408 }
409
410 /*----- Generate distinct values. ----------------------------------------------------------------------------*/
411 const std::size_t num_matching_values = cardinality_left == 0 or cardinality_right == 0 ? 0 :
412 std::round((double(result_size) * double(num_distinct_values_left) *
413 double(num_distinct_values_right)) /
414 (double(cardinality_left) * double(cardinality_right)));
415 const unsigned num_values = num_distinct_values_left + num_distinct_values_right - num_matching_values;
416 if (num_values == 0) {
417 distinct_values = { };
418 } else if (num_values == 1) {
419 distinct_values = { T(0) };
420 } else {
421 gs::LinearSpace<T> value_space = [num_values]() -> gs::LinearSpace<T> {
422 if constexpr (std::is_integral_v<T>) {
423 if constexpr (std::is_signed_v<T>) {
424 return gs::LinearSpace<T>(-5e7, 5e7, num_values - 1);
425 } else {
426 return gs::LinearSpace<T>(0, 1e8, num_values - 1);
427 }
428 } else {
429 if constexpr (std::is_signed_v<T>) {
430 return gs::LinearSpace<T>(-1, 1, num_values - 1);
431 } else {
432 return gs::LinearSpace<T>(0, 1, num_values - 1);
433 }
434 }
435 }();
436 distinct_values = value_space.sequence();
437 }
438 M_insist(distinct_values.size() == num_values);
439 /* Split distinct_values between left store and right store. */
440 std::vector<T> distinct_values_left(distinct_values.begin(),
441 distinct_values.begin() + num_distinct_values_left);
442 std::vector<T> distinct_values_right(distinct_values.rbegin(),
443 distinct_values.rbegin() + num_distinct_values_right);
444 M_insist(distinct_values_left.size() == num_distinct_values_left);
445 M_insist(distinct_values_right.size() == num_distinct_values_right);
446
447
448 /* Completely fill the entire column with the new distinct values. */
449 fill_uniform(val_column_left, distinct_values_left, 0, cardinality_left);
450 fill_uniform(val_column_right, distinct_values_right, 0, cardinality_right);
451
452 old_cardinality_left = cardinality_left;
453 old_cardinality_right = cardinality_right;
454
455 /* Measure time to scan tables. */
456 auto scan_time_left = time_select_query_execution(DB, "SELECT val FROM join_left;");
457 auto scan_time_right = time_select_query_execution(DB, "SELECT val FROM join_right;");
458
459 /* Evaluate join. */
461 DB, "SELECT 1 FROM join_left, join_right WHERE join_left.val = join_right.val;");
462
463 /* Insert training data into feature matrix and target vector. */
464 using namespace std::chrono;
465 const double time_in_millisecs =
466 duration_cast<microseconds>(query_time - scan_time_left - scan_time_right).count() / 1e3;
467 feature_matrix(row_index, 0) = 1; // add 1 for y-intercept
468 feature_matrix(row_index, 1) = cardinality_left;
469 feature_matrix(row_index, 2) = cardinality_right;
470 feature_matrix(row_index, 3) = redundancy_left;
471 feature_matrix(row_index, 4) = redundancy_right;
472 feature_matrix(row_index, 5) = result_size;
473 target_vector(row_index) = time_in_millisecs;
474 ++row_index;
475 });
476
477 return std::make_pair(feature_matrix.block(0,0,row_index,6), target_vector.head(row_index));
478}
479
480
481//======================================================================================================================
482// CostModelFactory Methods
483//======================================================================================================================
484
485
486template<typename T>
487CostModel CostModelFactory::generate_filter_cost_model(unsigned degree, const char *csv_folder_path)
488{
489 /* Generate filter training data for use in linear regression during cost model construction */
490 auto[feature_training_matrix, target_training_vector] = generate_training_suite_filter<T>();
491
492 /* create transformation lambda to append polynomial features to the feature matrix.
493 * the selectivity feature is potentiated to capture its non-linear cost relation with linear regression. */
494 std::function<Eigen::MatrixXd(Eigen::MatrixXd)> transformation = [degree](Eigen::MatrixXd feature_matrix) {
495 /* expected features:
496 * feature_matrix[0] := y-intercept (values in this column are always 1)
497 * feature_matrix[1] := number of input table rows to be filtered
498 * feature_matrix[2] := selectivity of the filter operation */
499 M_insist(feature_matrix.cols() == 3);
500 feature_matrix.conservativeResize(feature_matrix.rows(), 2 * degree + 1);
501 for (unsigned row = 0; row < feature_matrix.rows(); ++row) {
502 /* degree := maximum polynomial degree used to generate polynomial features for selectivity */
503 for (unsigned i = 2; i <= degree; ++i) {
504 /* append an intersection feature between number of rows and selectivity
505 * (feature_matrix[1] times feature_matrix[2] to the power of i - 1) */
506 feature_matrix(row, 2 * i - 1) = feature_matrix(row, 1) * std::pow(feature_matrix(row, 2), i - 1);
507 /* append a feature that potentiates selectivity to the power of i */
508 feature_matrix(row, 2 * i) = std::pow(feature_matrix(row, 2), i);
509 }
510 }
511 return feature_matrix;
512 };
513
514 CostModel filter_model(feature_training_matrix, target_training_vector, transformation);
515
516 /* export matrices to csv. */
517 if (csv_folder_path != nullptr) {
518 /* Training Data */
519 std::string features_csv_path = std::string(csv_folder_path) + "/filter_model_training_data.csv";
520 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
521 feature_training_matrix.cols() - 1 + target_training_vector.cols());
522 // remove '1s' column from target matrix
523 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
524 save_csv(features_csv_path, Xy, "num_rows,selectivity,time");
525
526 /* Estimated Coefficient. */
527 /* try to open filestream. */
528 std::string coef_csv_path = std::string(csv_folder_path) + "/filter_model_coef.csv";
529 save_csv(coef_csv_path, filter_model.get_coefficients());
530 }
531
532 return filter_model;
533}
534
535template<typename T>
537{
538 /* Generate group by training data for use in linear regression during cost model construction */
539 auto[feature_training_matrix, target_training_vector] = generate_training_suite_group_by<T>();
540 CostModel group_by_model(feature_training_matrix, target_training_vector);
541
542 /* export matrices to csv. */
543 if (csv_folder_path != nullptr) {
544 /* Training Data */
545 std::string features_csv_path = std::string(csv_folder_path) + "/group_by_model_training_data.csv";
546 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
547 feature_training_matrix.cols() - 1 + target_training_vector.cols());
548 /* remove '1s' column from target matrix */
549 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
550 save_csv(features_csv_path, Xy, "num_rows,num_distinct_values,time");
551
552 /* Estimated Coefficient. */
553 /* try to open filestream. */
554 std::string coef_csv_path = std::string(csv_folder_path) + "/group_by_model_coef.csv";
555 save_csv(coef_csv_path, group_by_model.get_coefficients());
556 }
557
558 return group_by_model;
559}
560
561template<typename T>
563{
564 /* Generate join training data for use in linear regression during cost model construction */
565 auto[feature_training_matrix, target_training_vector] = generate_training_suite_join<T>();
566 CostModel join_cost_model(feature_training_matrix, target_training_vector);
567
568 /* export matrices to csv. */
569 if (csv_folder_path != nullptr) {
570 /* Training Data */
571 std::string features_csv_path = std::string(csv_folder_path) + "/join_model_training_data.csv";
572 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
573 feature_training_matrix.cols() - 1 + target_training_vector.cols());
574 // remove '1s' column from target matrix
575 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
576 save_csv(features_csv_path, Xy,
577 "num_rows_left,num_rows_right,redundancy_left,redundancy_right,result_size,time");
578
579 /* Estimated Coefficient. */
580 /* try to open filestream. */
581 std::string coef_csv_path = std::string(csv_folder_path) + "/join_model_coef.csv";
582 save_csv(coef_csv_path, join_cost_model.get_coefficients());
583 }
584
585 return join_cost_model;
586}
587
588std::unique_ptr<CostFunction> CostModelFactory::get_cost_function()
589{
590 auto filter_model = std::make_unique<CostModel>
591 (generate_filter_cost_model<int32_t>(DEFAULT_FILTER_POLYNOMIAL_DEGREE));
592 auto join_model = std::make_unique<CostModel>(generate_join_cost_model<int32_t>());
593 auto grouping_model = std::make_unique<CostModel>(generate_group_by_cost_model<int32_t>());
594 return std::make_unique<TrainedCostFunction>(std::move(filter_model), std::move(join_model),
595 std::move(grouping_model));
596}
597
598#define DEFINE(TYPE) \
599template CostModel CostModelFactory::generate_filter_cost_model<TYPE>(unsigned degree, const char *csv_folder_path); \
600template CostModel CostModelFactory::generate_group_by_cost_model<TYPE>(const char *csv_folder_path); \
601template CostModel CostModelFactory::generate_join_cost_model<TYPE>(const char *csv_folder_path)
602DEFINE(int8_t);
603DEFINE(int16_t);
604DEFINE(int32_t);
605DEFINE(int64_t);
606DEFINE(float);
607DEFINE(double);
608#undef DEFINE
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_group_by()
Generates data for training group-by-models.
Definition: CostModel.cpp:190
uint64_t get_column_offset_in_bytes(const DataLayout &layout, std::size_t idx)
Returns the offset in bytes of the idx-th column in the DataLayout layout which is considered to repr...
Definition: CostModel.cpp:28
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_filter()
Generates data for training filter models.
Definition: CostModel.cpp:93
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_join()
Generates data for training join-models.
Definition: CostModel.cpp:322
Timer::duration time_select_query_execution(Database &DB, const std::string &input)
Executes the given query and returns the median of query execution times for all SELECT queries.
Definition: CostModel.cpp:60
static constexpr std::size_t NUM_DISTINCT_VALUES_IN_FILTER_EXPERIMENT
Definition: CostModel.cpp:16
#define DEFINE(TYPE)
Definition: CostModel.cpp:598
constexpr unsigned NUM_REPETITIONS
The number of times a benchmark should be repeated to reduce noise in the data.
Definition: CostModel.cpp:19
void save_csv(const std::string &csv_path, const Eigen::MatrixXd &matrix, const std::string &header="")
Save matrix in a csv file.
Definition: CostModel.cpp:37
static constexpr unsigned DEFAULT_FILTER_POLYNOMIAL_DEGREE
Definition: CostModel.cpp:17
A model for predicting the costs of a physical operator.
Definition: LinearModel.hpp:11
Eigen::VectorXd get_coefficients() const
Definition: LinearModel.hpp:42
#define M_insist(...)
Definition: macro.hpp:129
‍mutable namespace
Definition: Backend.hpp:10
void set_all_not_null(uint8_t *column_ptr, std::size_t num_attrs, std::size_t begin, std::size_t end)
Sets all attributes of the begin-th row (including) to the end-th row (excluding) of column at addres...
T(x)
and
Definition: enum_ops.hpp:12
std::unique_ptr< ast::Stmt > M_EXPORT statement_from_string(Diagnostic &diag, const std::string &str)
Use lexer, parser, and semantic analysis to create a Stmt from str.
Definition: mutable.cpp:25
void M_EXPORT execute_query(Diagnostic &diag, const ast::SelectStmt &stmt, std::unique_ptr< Consumer > consumer)
Optimizes and executes the given SelectStmt.
Definition: mutable.cpp:368
void generate_primary_keys(void *column_ptr, const Type &type, std::size_t begin, std::size_t end)
Generates primary keys of Type type for the begin-th row (including) to the end-th row (excluding) of...
std::enable_if_t< std::is_arithmetic_v< T >, void > M_EXPORT fill_uniform(T *column_ptr, std::vector< T > values, std::size_t begin, std::size_t end, Generator &&g=Generator())
Fills column at address column_ptr from begin-th row (including) to end-th row (excluding) with data ...
Definition: store_manip.hpp:30
The catalog contains all Databases and keeps track of all meta information of the database system.
Definition: Catalog.hpp:215
std::unique_ptr< Store > create_store(const Table &tbl) const
Creates a new Store for the given Table tbl.
Definition: Catalog.hpp:352
Database & add_database(ThreadSafePooledString name)
Creates a new Database with the given name.
Definition: Catalog.cpp:58
ThreadSafePooledString pool(const char *str) const
Creates an internalized copy of the string str by adding it to the internal StringPool.
Definition: Catalog.hpp:274
static Catalog & Get()
Return a reference to the single Catalog instance.
void drop_database(const ThreadSafePooledString &name)
Drops the Database with the name.
Definition: Catalog.cpp:66
void unset_database_in_use()
Unsets the Database that is currenly in use.
Definition: Catalog.hpp:305
static CostModel generate_group_by_cost_model(const char *csv_folder_path=nullptr)
Generates a cost model for the group by operator.
Definition: CostModel.cpp:536
static std::unique_ptr< CostFunction > get_cost_function()
Generates a CostFunction containing CostModels for cost estimation.
Definition: CostModel.cpp:588
static CostModel generate_filter_cost_model(unsigned degree, const char *csv_folder_path=nullptr)
Generates a cost model for the filter operator.
Definition: CostModel.cpp:487
static CostModel generate_join_cost_model(const char *csv_folder_path=nullptr)
Generates a cost model for the join operator.
Definition: CostModel.cpp:562
A Database is a set of Tables, Functions, and Statistics.
Definition: Schema.hpp:870
Table & add_table(ThreadSafePooledString name)
Adds a new Table to this Database.
Definition: Schema.cpp:316
virtual void push_back(ThreadSafePooledString name, const PrimitiveType *type)=0
Adds a new attribute with the given name and type to the table.
clock::duration duration
Definition: Timer.hpp:40
static Pooled< Numeric > Get_Integer(category_t category, unsigned num_bytes)
Returns a Numeric type for integrals of given category and num_bytes bytes.
Definition: Type.cpp:94
std::size_t num_points() const
Definition: GridSearch.hpp:149
std::vector< value_type > sequence() const
Definition: GridSearch.hpp:114
value_type lo() const
Definition: GridSearch.hpp:88
difference_type delta() const
Definition: GridSearch.hpp:92
value_type hi() const
Definition: GridSearch.hpp:89
Models how data is laid out in a linear address space.
Definition: DataLayout.hpp:29
const Node & child() const
‍returns a reference to the single child of this DataLayout
Definition: DataLayout.hpp:209