30 auto &child = as<const DataLayout::INode>(layout.
child()).at(idx);
31 M_insist(as<DataLayout::Leaf>(child.ptr.get())->index() == idx,
"index of entry must match index in leaf");
32 M_insist(child.offset_in_bits % 8 == 0,
"column must be byte-aligned");
33 return child.offset_in_bits / 8;
37void save_csv(
const std::string &csv_path,
const Eigen::MatrixXd &matrix,
const std::string &header =
"")
39 std::ofstream csv_file(csv_path);
41 std::cerr <<
"Filepath \"" << csv_path <<
"\" is invalid.";
45 if (not header.empty()) {
46 csv_file << header <<
'\n';
49 Eigen::IOFormat csvFmt(Eigen::StreamPrecision, Eigen::DontAlignCols,
51 csv_file << matrix.format(csvFmt) << std::endl;
62 using namespace std::chrono;
67 C.set_database_in_use(DB);
70 auto query = as<ast::SelectStmt>(stmt.release());
72 using duration_t = std::remove_reference_t<
decltype(C.timer())>::duration;
73 std::vector<duration_t> execution_times;
75 auto consumer = std::make_unique<NoOpOperator>(devnull);
77 auto measurement = C.timer().get(
"Execute the query");
78 execution_times.push_back(measurement.duration());
81 std::sort(execution_times.begin(), execution_times.end());
82 const std::size_t n = execution_times.size();
83 const Timer::duration median = (execution_times[(n - 1) / 2] + execution_times[n / 2]) / 2;
107 Eigen::MatrixXd feature_matrix(GS.
num_points(), 3);
108 Eigen::VectorXd target_vector(GS.
num_points());
116 table.push_back(C.
pool(
"val"), get_runtime_type<T>());
120 table.layout(factory);
121 uint8_t *mem_ptr =
reinterpret_cast<uint8_t*
>(table.store().memory().addr());
128 if constexpr (std::is_integral_v<T>) {
129 if constexpr (std::is_signed_v<T>) {
135 if constexpr (std::is_signed_v<T>) {
142 const std::vector<T> values = value_space.
sequence();
146 std::size_t row_index = 0;
147 std::ostringstream oss;
148 unsigned old_cardinality = 0;
150 GS([&](
unsigned cardinality,
double selectivity) {
151 if (cardinality != old_cardinality) {
152 const std::size_t delta_cardinality = cardinality - old_cardinality;
155 for (
unsigned i = 0; i != delta_cardinality; ++i) table.store().append();
156 M_insist(table.store().num_rows() == cardinality);
157 set_all_not_null(null_bitmap_column, table.num_attrs(), old_cardinality, cardinality);
159 fill_uniform(val_column, values, old_cardinality, cardinality);
160 M_insist(table.store().num_rows() == cardinality);
161 old_cardinality = cardinality;
169 oss <<
"SELECT 1 FROM filter WHERE val < " <<
T(value_space.
lo() + selectivity * value_space.
delta()) <<
';';
173 using namespace std::chrono;
174 const double time_in_millisecs = duration_cast<microseconds>(query_time - scan_time).count() / 1e3;
175 feature_matrix(row_index, 0) = 1;
176 feature_matrix(row_index, 1) = cardinality;
177 feature_matrix(row_index, 2) = selectivity;
178 target_vector(row_index) = time_in_millisecs;
185 return std::make_pair(feature_matrix, target_vector);
200 space_of_distinct_values,
205 Eigen::MatrixXd feature_matrix(GS.
num_points(), 3);
206 Eigen::VectorXd target_vector(GS.
num_points());
215 table.push_back(C.
pool(
"val"), get_runtime_type<T>());
219 table.layout(factory);
220 uint8_t *mem_ptr =
reinterpret_cast<uint8_t*
>(table.store().memory().addr());
225 std::vector<T> distinct_values;
226 std::size_t row_index = 0;
227 std::ostringstream oss;
228 unsigned old_cardinality = table.store().num_rows();
229 unsigned old_num_distinct_values = 0;
231 GS([&](
unsigned num_distinct_values,
unsigned cardinality) {
232 M_insist(table.store().num_rows() == old_cardinality);
234 if (old_num_distinct_values != num_distinct_values) {
235 if (old_cardinality > cardinality) {
237 for (
unsigned i = old_cardinality; i != cardinality; --i) table.store().drop();
238 }
else if (old_cardinality < cardinality) {
240 for (
unsigned i = old_cardinality; i != cardinality; ++i) table.store().append();
241 M_insist(table.store().num_rows() == cardinality);
242 set_all_not_null(null_bitmap_column, table.num_attrs(), cardinality, old_cardinality);
245 M_insist(table.store().num_rows() == cardinality);
248 if (num_distinct_values == 1) {
249 distinct_values = {
T(0) };
252 if constexpr (std::is_integral_v<T>) {
253 if constexpr (std::is_signed_v<T>) {
259 if constexpr (std::is_signed_v<T>) {
266 distinct_values = value_space.
sequence();
268 M_insist(distinct_values.size() == num_distinct_values);
271 fill_uniform(val_column, distinct_values, 0, cardinality);
273 old_cardinality = cardinality;
274 old_num_distinct_values = num_distinct_values;
278 }
else if (old_cardinality < cardinality) {
280 for (
unsigned i = old_cardinality; i != cardinality; ++i) table.store().append();
281 M_insist(table.store().num_rows() == cardinality);
282 set_all_not_null(null_bitmap_column, table.num_attrs(), old_cardinality, cardinality);
284 fill_uniform(val_column, distinct_values, old_cardinality, cardinality);
286 old_cardinality = cardinality;
290 }
else if (old_cardinality > cardinality) {
292 for (
unsigned i = old_cardinality; i != cardinality; --i) table.store().drop();
293 M_insist(table.store().num_rows() == cardinality);
304 oss <<
"SELECT val FROM group_by GROUP BY val;";
308 using namespace std::chrono;
309 const double time_in_millisecs = duration_cast<microseconds>(query_time - scan_time).count() / 1e3;
310 feature_matrix(row_index, 0) = 1;
311 feature_matrix(row_index, 1) = cardinality;
312 feature_matrix(row_index, 2) = num_distinct_values;
313 target_vector(row_index) = time_in_millisecs;
317 return std::make_pair(feature_matrix, target_vector);
337 gs::GridSearch GS(space_result_size, space_redundancy_left, space_redundancy_right,
338 space_cardinality_left, space_cardinality_right);
341 Eigen::MatrixXd feature_matrix(GS.
num_points(), 6);
342 Eigen::VectorXd target_vector(GS.
num_points());
351 table_left.push_back(C.
pool(
"val"), get_runtime_type<T>());
354 table_right.push_back(C.
pool(
"val"), get_runtime_type<T>());
360 table_left.layout(factory_left);
361 table_right.layout(factory_right);
362 uint8_t *mem_ptr_left =
reinterpret_cast<uint8_t*
>(table_left.store().memory().addr());
363 uint8_t *mem_ptr_right =
reinterpret_cast<uint8_t*
>(table_right.store().memory().addr());
365 uint8_t *null_bitmap_column_right = mem_ptr_right +
get_column_offset_in_bytes(table_right.layout(), table_right.num_attrs());
371 std::vector<T> distinct_values;
372 std::size_t row_index = 0;
373 unsigned old_cardinality_left = table_left.store().num_rows();
374 unsigned old_cardinality_right = table_right.store().num_rows();
375 GS([&](
unsigned result_size,
unsigned redundancy_left,
unsigned redundancy_right,
376 unsigned cardinality_left,
unsigned cardinality_right) {
377 M_insist(table_left.store().num_rows() == old_cardinality_left
and
378 table_right.store().num_rows() == old_cardinality_right);
381 const unsigned num_distinct_values_left = cardinality_left / redundancy_left;
382 const unsigned num_distinct_values_right = cardinality_right / redundancy_right;
383 const unsigned max_result_size = redundancy_right * redundancy_left
384 * std::min(num_distinct_values_left, num_distinct_values_right);
385 if (cardinality_left < redundancy_left or cardinality_right < redundancy_right or max_result_size < result_size)
388 if (old_cardinality_left < cardinality_left) {
390 for (
unsigned i = old_cardinality_left; i != cardinality_left; ++i) table_left.store().append();
391 M_insist(table_left.store().num_rows() == cardinality_left);
392 set_all_not_null(null_bitmap_column_left, table_left.num_attrs(), old_cardinality_left, cardinality_left);
393 generate_primary_keys(id_column_left, *table_left[0UL].type, old_cardinality_left, cardinality_left);
394 }
else if (old_cardinality_left > cardinality_left) {
396 for (
unsigned i = old_cardinality_left; i != cardinality_left; --i) table_left.store().drop();
398 M_insist(table_left.store().num_rows() == cardinality_left);
399 if (old_cardinality_right < cardinality_right) {
401 for (
unsigned i = old_cardinality_right; i != cardinality_right; ++i) table_right.store().append();
402 M_insist(table_right.store().num_rows() == cardinality_right);
403 set_all_not_null(null_bitmap_column_right, table_right.num_attrs(), old_cardinality_right, cardinality_right);
404 generate_primary_keys(id_column_right, *table_right[0UL].type, old_cardinality_right, cardinality_right);
405 }
else if (old_cardinality_right > cardinality_right) {
407 for (
unsigned i = old_cardinality_right; i != cardinality_right; --i) table_right.store().drop();
411 const std::size_t num_matching_values = cardinality_left == 0 or cardinality_right == 0 ? 0 :
412 std::round((
double(result_size) *
double(num_distinct_values_left) *
413 double(num_distinct_values_right)) /
414 (
double(cardinality_left) *
double(cardinality_right)));
415 const unsigned num_values = num_distinct_values_left + num_distinct_values_right - num_matching_values;
416 if (num_values == 0) {
417 distinct_values = { };
418 }
else if (num_values == 1) {
419 distinct_values = {
T(0) };
422 if constexpr (std::is_integral_v<T>) {
423 if constexpr (std::is_signed_v<T>) {
429 if constexpr (std::is_signed_v<T>) {
436 distinct_values = value_space.
sequence();
438 M_insist(distinct_values.size() == num_values);
440 std::vector<T> distinct_values_left(distinct_values.begin(),
441 distinct_values.begin() + num_distinct_values_left);
442 std::vector<T> distinct_values_right(distinct_values.rbegin(),
443 distinct_values.rbegin() + num_distinct_values_right);
444 M_insist(distinct_values_left.size() == num_distinct_values_left);
445 M_insist(distinct_values_right.size() == num_distinct_values_right);
449 fill_uniform(val_column_left, distinct_values_left, 0, cardinality_left);
450 fill_uniform(val_column_right, distinct_values_right, 0, cardinality_right);
452 old_cardinality_left = cardinality_left;
453 old_cardinality_right = cardinality_right;
461 DB,
"SELECT 1 FROM join_left, join_right WHERE join_left.val = join_right.val;");
464 using namespace std::chrono;
465 const double time_in_millisecs =
466 duration_cast<microseconds>(query_time - scan_time_left - scan_time_right).count() / 1e3;
467 feature_matrix(row_index, 0) = 1;
468 feature_matrix(row_index, 1) = cardinality_left;
469 feature_matrix(row_index, 2) = cardinality_right;
470 feature_matrix(row_index, 3) = redundancy_left;
471 feature_matrix(row_index, 4) = redundancy_right;
472 feature_matrix(row_index, 5) = result_size;
473 target_vector(row_index) = time_in_millisecs;
477 return std::make_pair(feature_matrix.block(0,0,row_index,6), target_vector.head(row_index));
490 auto[feature_training_matrix, target_training_vector] = generate_training_suite_filter<T>();
494 std::function<Eigen::MatrixXd(Eigen::MatrixXd)> transformation = [degree](Eigen::MatrixXd feature_matrix) {
499 M_insist(feature_matrix.cols() == 3);
500 feature_matrix.conservativeResize(feature_matrix.rows(), 2 * degree + 1);
501 for (
unsigned row = 0; row < feature_matrix.rows(); ++row) {
503 for (
unsigned i = 2; i <= degree; ++i) {
506 feature_matrix(row, 2 * i - 1) = feature_matrix(row, 1) * std::pow(feature_matrix(row, 2), i - 1);
508 feature_matrix(row, 2 * i) = std::pow(feature_matrix(row, 2), i);
511 return feature_matrix;
514 CostModel filter_model(feature_training_matrix, target_training_vector, transformation);
517 if (csv_folder_path !=
nullptr) {
519 std::string features_csv_path = std::string(csv_folder_path) +
"/filter_model_training_data.csv";
520 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
521 feature_training_matrix.cols() - 1 + target_training_vector.cols());
523 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
524 save_csv(features_csv_path, Xy,
"num_rows,selectivity,time");
528 std::string coef_csv_path = std::string(csv_folder_path) +
"/filter_model_coef.csv";
539 auto[feature_training_matrix, target_training_vector] = generate_training_suite_group_by<T>();
540 CostModel group_by_model(feature_training_matrix, target_training_vector);
543 if (csv_folder_path !=
nullptr) {
545 std::string features_csv_path = std::string(csv_folder_path) +
"/group_by_model_training_data.csv";
546 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
547 feature_training_matrix.cols() - 1 + target_training_vector.cols());
549 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
550 save_csv(features_csv_path, Xy,
"num_rows,num_distinct_values,time");
554 std::string coef_csv_path = std::string(csv_folder_path) +
"/group_by_model_coef.csv";
558 return group_by_model;
565 auto[feature_training_matrix, target_training_vector] = generate_training_suite_join<T>();
566 CostModel join_cost_model(feature_training_matrix, target_training_vector);
569 if (csv_folder_path !=
nullptr) {
571 std::string features_csv_path = std::string(csv_folder_path) +
"/join_model_training_data.csv";
572 Eigen::MatrixXd Xy(feature_training_matrix.rows(),
573 feature_training_matrix.cols() - 1 + target_training_vector.cols());
575 Xy << feature_training_matrix.rightCols(feature_training_matrix.cols() - 1), target_training_vector;
577 "num_rows_left,num_rows_right,redundancy_left,redundancy_right,result_size,time");
581 std::string coef_csv_path = std::string(csv_folder_path) +
"/join_model_coef.csv";
585 return join_cost_model;
590 auto filter_model = std::make_unique<CostModel>
592 auto join_model = std::make_unique<CostModel>(generate_join_cost_model<int32_t>());
593 auto grouping_model = std::make_unique<CostModel>(generate_group_by_cost_model<int32_t>());
594 return std::make_unique<TrainedCostFunction>(std::move(filter_model), std::move(join_model),
595 std::move(grouping_model));
598#define DEFINE(TYPE) \
599template CostModel CostModelFactory::generate_filter_cost_model<TYPE>(unsigned degree, const char *csv_folder_path); \
600template CostModel CostModelFactory::generate_group_by_cost_model<TYPE>(const char *csv_folder_path); \
601template CostModel CostModelFactory::generate_join_cost_model<TYPE>(const char *csv_folder_path)
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_group_by()
Generates data for training group-by-models.
uint64_t get_column_offset_in_bytes(const DataLayout &layout, std::size_t idx)
Returns the offset in bytes of the idx-th column in the DataLayout layout which is considered to repr...
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_filter()
Generates data for training filter models.
std::pair< Eigen::MatrixXd, Eigen::VectorXd > generate_training_suite_join()
Generates data for training join-models.
Timer::duration time_select_query_execution(Database &DB, const std::string &input)
Executes the given query and returns the median of query execution times for all SELECT queries.
static constexpr std::size_t NUM_DISTINCT_VALUES_IN_FILTER_EXPERIMENT
constexpr unsigned NUM_REPETITIONS
The number of times a benchmark should be repeated to reduce noise in the data.
void save_csv(const std::string &csv_path, const Eigen::MatrixXd &matrix, const std::string &header="")
Save matrix in a csv file.
static constexpr unsigned DEFAULT_FILTER_POLYNOMIAL_DEGREE
A model for predicting the costs of a physical operator.
Eigen::VectorXd get_coefficients() const
void set_all_not_null(uint8_t *column_ptr, std::size_t num_attrs, std::size_t begin, std::size_t end)
Sets all attributes of the begin-th row (including) to the end-th row (excluding) of column at addres...
std::unique_ptr< ast::Stmt > M_EXPORT statement_from_string(Diagnostic &diag, const std::string &str)
Use lexer, parser, and semantic analysis to create a Stmt from str.
void M_EXPORT execute_query(Diagnostic &diag, const ast::SelectStmt &stmt, std::unique_ptr< Consumer > consumer)
Optimizes and executes the given SelectStmt.
void generate_primary_keys(void *column_ptr, const Type &type, std::size_t begin, std::size_t end)
Generates primary keys of Type type for the begin-th row (including) to the end-th row (excluding) of...
std::enable_if_t< std::is_arithmetic_v< T >, void > M_EXPORT fill_uniform(T *column_ptr, std::vector< T > values, std::size_t begin, std::size_t end, Generator &&g=Generator())
Fills column at address column_ptr from begin-th row (including) to end-th row (excluding) with data ...
The catalog contains all Databases and keeps track of all meta information of the database system.
std::unique_ptr< Store > create_store(const Table &tbl) const
Creates a new Store for the given Table tbl.
Database & add_database(ThreadSafePooledString name)
Creates a new Database with the given name.
ThreadSafePooledString pool(const char *str) const
Creates an internalized copy of the string str by adding it to the internal StringPool.
static Catalog & Get()
Return a reference to the single Catalog instance.
void drop_database(const ThreadSafePooledString &name)
Drops the Database with the name.
void unset_database_in_use()
Unsets the Database that is currenly in use.
static CostModel generate_group_by_cost_model(const char *csv_folder_path=nullptr)
Generates a cost model for the group by operator.
static std::unique_ptr< CostFunction > get_cost_function()
Generates a CostFunction containing CostModels for cost estimation.
static CostModel generate_filter_cost_model(unsigned degree, const char *csv_folder_path=nullptr)
Generates a cost model for the filter operator.
static CostModel generate_join_cost_model(const char *csv_folder_path=nullptr)
Generates a cost model for the join operator.
A Database is a set of Tables, Functions, and Statistics.
Table & add_table(ThreadSafePooledString name)
Adds a new Table to this Database.
virtual void push_back(ThreadSafePooledString name, const PrimitiveType *type)=0
Adds a new attribute with the given name and type to the table.
static Pooled< Numeric > Get_Integer(category_t category, unsigned num_bytes)
Returns a Numeric type for integrals of given category and num_bytes bytes.
std::size_t num_points() const
std::vector< value_type > sequence() const
difference_type delta() const
Models how data is laid out in a linear address space.
const Node & child() const
returns a reference to the single child of this DataLayout