mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
train-operator-model.cpp
Go to the documentation of this file.
2#include "util/GridSearch.hpp"
3#include <Eigen/LU>
4#include <fstream>
5#include <iostream>
9#include <mutable/Options.hpp>
11
12
13using namespace m;
14using namespace Eigen;
15
16
20typedef Matrix<double, Dynamic, Dynamic, RowMajor> RowMatrixXd;
21std::pair<RowMatrixXd, VectorXd> load_csv (const char *csv_path)
22{
23 std::ifstream csv_file(csv_path);
24 if (!csv_file) {
25 std::cerr << "Filepath \"" << csv_path << "\" is invalid.";
26 exit(EXIT_FAILURE);
27 }
28
29 std::vector<double> feature_values;
30 std::vector<double> target_values;
31 std::string line;
32 // parse header
33 std::getline(csv_file, line);
34 std::stringstream ls(line);
35 std::string ss;
36 unsigned index = 0;
37 while (std::getline(ls, ss, ',')) {
38 if (ss.find("time") != std::string::npos) {
39 break;
40 }
41 ++index;
42 }
43
44 // parse values
45 unsigned rows = 0;
46 while (std::getline(csv_file, line)) {
47 // fill first column with 1.0 the y-intersect
48 feature_values.push_back(1.0);
49
50 std::stringstream line_stream(line);
51 std::string cell;
52 unsigned i = 0;
53 while (std::getline(line_stream, cell, ',')) {
54 // check if feature or target value
55 if (i == index) {
56 target_values.push_back(std::stod(cell));
57 } else {
58 feature_values.push_back(std::stod(cell));
59 }
60 ++i;
61 }
62 ++rows;
63 }
64 Map<RowMatrixXd> feature_matrix(feature_values.data(), rows, feature_values.size()/rows);
65 Map<VectorXd> target_vector(target_values.data(), rows, 1);
66 return std::pair<RowMatrixXd, VectorXd>(feature_matrix, target_vector);
67}
68
72template<typename T>
73CostModel load_filter_cost_model(const char *csv_path, unsigned degree = 9)
74{
75 std::ifstream csv_file(csv_path);
76 if (!csv_file) {
77 std::cerr << "Filepath \"" << csv_path << "\" is invalid.";
78 exit(EXIT_FAILURE);
79 }
80
81 // parse coefficients
82 std::vector<double> coefficients;
83 std::string line;
84 unsigned rows = 0;
85 while (std::getline(csv_file, line)) {
86 coefficients.push_back(std::stod(line));
87 ++rows;
88 }
89 M_insist(std::floor(rows / 2) == degree);
90 Map<VectorXd> coefficients_vector(coefficients.data(),rows, 1);
91
92 return CostModel(coefficients_vector, 3, [degree](Eigen::MatrixXd featureMatrix) {
93 M_insist(featureMatrix.cols() == 3);
94 featureMatrix.conservativeResize(featureMatrix.rows(), 2 * degree + 1);
95 for (unsigned row = 0; row < featureMatrix.rows(); ++row) {
96 for (unsigned i = 2; i <= degree; ++i) {
97 featureMatrix(row, 2 * i - 1) = featureMatrix(row, 1) * std::pow(featureMatrix(row, 2), i - 1);
98 featureMatrix(row, 2 * i) = std::pow(featureMatrix(row, 2), i);
99 }
100 }
101 return featureMatrix;
102 });
103}
104
108template<typename T>
109CostModel load_cost_model(const char *csv_path)
110{
111 std::ifstream csv_file(csv_path);
112 if (!csv_file) {
113 std::cerr << "Filepath \"" << csv_path << "\" is invalid.";
114 exit(EXIT_FAILURE);
115 }
116
117 // parse coefficients
118 std::vector<double> coefficients;
119 std::string line;
120 unsigned rows = 0;
121 while (std::getline(csv_file, line)) {
122 coefficients.push_back(std::stod(line));
123 ++rows;
124 }
125 Map<VectorXd> coefficients_vector(coefficients.data(),rows, 1);
126
127 return CostModel(coefficients_vector);
128}
129
130//======================================================================================================================
131// Main
132//======================================================================================================================
133
134
/**
 * Prints a short description of the tool and its invocation synopsis.
 *
 * The tool is controlled through options only (`main` rejects positional arguments), hence the synopsis shows
 * `[OPTIONS]`.  The output is terminated by a newline and the stream is flushed.
 *
 * @param out   the stream to write to
 * @param name  the name under which the program was invoked
 */
void usage(std::ostream &out, const char *name)
{
    out << "A command line tool to generate physical operator cost models.\n"
        << "USAGE:\n\t" << name << " [OPTIONS]"
        << std::endl;
}
141
142int main(int argc, const char **argv)
143{
144 struct {
145 bool show_help; // show help
146
147 /* Operator Models */
148 const char* gen_filter_model;
149 const char* gen_group_by_model;
150 const char* gen_join_model;
151
152 const char* load_filter_model;
153 const char* load_group_by_model;
154 const char* load_join_model;
155
156 const char* eval_filter_model;
157 const char* eval_group_by_model;
158 const char* eval_join_model;
159
160 /* Filter Model polynomial degree*/
161 unsigned degree;
162
163 /* prediction: feature values */
164 double num_rows;
165 double num_distinct_values;
166 double selectivity;
167
168 double num_rows_left;
169 double num_rows_right;
170 double redundancy_left;
171 double redundancy_right;
172 double result_size;
173
174 /* Catalog Options */
175 const char *backend;
176 } args;
177
178 /*----- Parse command line arguments. ----------------------------------------------------------------------------*/
179 ArgParser AP;
180 Catalog &C = Catalog::Get();
181#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)\
182 VAR = INIT;\
183 {\
184 AP.add<TYPE>(SHORT, LONG, DESCR, CALLBACK);\
185 }
186 ADD(bool, args.show_help, false, /* Type, Var, Init */
187 "-h", "--help", /* Short, Long */
188 "prints this help message", /* Description */
189 [&](bool) { args.show_help = true; }); /* Callback */
190 ADD(const char*, args.gen_filter_model, nullptr, /* Type, Var, Init */
191 "-f", "--filter", /* Short, Long */
192 "generate a filter cost model and saves it in the given folder", /* Description */
193 [&](const char* str) { args.gen_filter_model = str; }); /* Callback */
194 ADD(const char*, args.gen_group_by_model, nullptr, /* Type, Var, Init */
195 "-g", "--group_by", /* Short, Long */
196 "generate a group by cost model and saves it in the given folder", /* Description */
197 [&](const char* str) { args.gen_group_by_model = str; }); /* Callback */
198 ADD(const char*, args.gen_join_model, nullptr, /* Type, Var, Init */
199 "-j", "--join", /* Short, Long */
200 "generate a join cost modeland saves it in the given folder", /* Description */
201 [&](const char* str) { args.gen_join_model = str; }); /* Callback */
202 ADD(const char *, args.load_filter_model, nullptr, /* Type, Var, Init */
203 nullptr, "--load_filter", /* Short, Long */
204 "load a filter model from csv file", /* Description */
205 [&](const char *str) { args.load_filter_model = str; }); /* Callback */
206 ADD(const char *, args.load_group_by_model, nullptr, /* Type, Var, Init */
207 nullptr, "--load_group_by", /* Short, Long */
208 "load a group by model from csv file", /* Description */
209 [&](const char *str) { args.load_group_by_model = str; }); /* Callback */
210 ADD(const char *, args.load_join_model, nullptr, /* Type, Var, Init */
211 nullptr, "--load_join", /* Short, Long */
212 "load a join model from csv file", /* Description */
213 [&](const char *str) { args.load_join_model = str; }); /* Callback */
214 ADD(const char *, args.eval_filter_model, nullptr, /* Type, Var, Init */
215 nullptr, "--eval_filter", /* Short, Long */
216 "load & evaluate a filter model from csv file", /* Description */
217 [&](const char *str) { args.eval_filter_model = str; }); /* Callback */
218 ADD(const char *, args.eval_group_by_model, nullptr, /* Type, Var, Init */
219 nullptr, "--eval_group_by", /* Short, Long */
220 "load & evaluate a group by model from csv file", /* Description */
221 [&](const char *str) { args.eval_group_by_model = str; }); /* Callback */
222 ADD(const char *, args.eval_join_model, nullptr, /* Type, Var, Init */
223 nullptr, "--eval_join", /* Short, Long */
224 "load & evaluate a join model from csv file", /* Description */
225 [&](const char *str) { args.eval_join_model = str; }); /* Callback */
226 ADD(int, args.degree, 9, /* Type, Var, Init */
227 nullptr, "--degree", /* Short, Long */
228 "set the polynomial degree used in the filter cost model (default = 9)", /* Description */
229 [&](int nr) { args.degree = nr; }); /* Callback */
230 ADD(int, args.num_rows, 0, /* Type, Var, Init */
231 nullptr, "--num_rows", /* Short, Long */
232 "set the number of rows used in the cost model prediction", /* Description */
233 [&](int nr) { args.num_rows = double(nr); }); /* Callback */
234 ADD(int, args.num_distinct_values, 0, /* Type, Var, Init */
235 nullptr, "--num_distinct_values", /* Short, Long */
236 "set the number of distinct values used in the cost model prediction", /* Description */
237 [&](int ndv) { args.num_distinct_values = double(ndv); }); /* Callback */
238 ADD(int, args.selectivity, 0, /* Type, Var, Init */
239 nullptr, "--selectivity", /* Short, Long */
240 "set the selectivity used in the cost model prediction (in %)", /* Description */
241 [&](int sel) { args.selectivity = double(sel) / 100.0; }); /* Callback */
242 ADD(int, args.num_rows_left, 0, /* Type, Var, Init */
243 nullptr, "--num_rows_left", /* Short, Long */
244 "set the number of rows used in the cost model prediction (join only)", /* Description */
245 [&](int nr) { args.num_rows_left = double(nr); }); /* Callback */
246 ADD(int, args.num_rows_right, 0, /* Type, Var, Init */
247 nullptr, "--num_rows_right", /* Short, Long */
248 "set the number of rows used in the cost model prediction (join only)", /* Description */
249 [&](int nr) { args.num_rows_right = double(nr); }); /* Callback */
250 ADD(int, args.redundancy_left, 1, /* Type, Var, Init */
251 nullptr, "--redundancy_left", /* Short, Long */
252 "set the redundancy of a value in the cost model prediction (join only)", /* Description */
253 [&](int red) { args.redundancy_left = double(red); }); /* Callback */
254 ADD(int, args.redundancy_right, 1, /* Type, Var, Init */
255 nullptr, "--redundancy_right", /* Short, Long */
256 "set the redundancy of a value in the cost model prediction (join only)", /* Description */
257 [&](int red) { args.redundancy_right = double(red); }); /* Callback */
258 ADD(int, args.result_size, 0, /* Type, Var, Init */
259 nullptr, "--result_size", /* Short, Long */
260 "set the size of the result in the cost model prediction (join only)", /* Description */
261 [&](int res) { args.result_size = double(res); }); /* Callback */
262 /*----- Select backend implementation ----------------------------------------------------------------------------*/
263 ADD(const char *, args.backend, /* Type, Var */
264 "WasmV8", /* Init */
265 nullptr, "--backend", /* Short, Long */
266 "specify the execution backend", /* Description */
267 /* Callback */
268 [&](const char *str) {
269 try {
270 C.default_backend(C.pool(str));
271 args.backend = str;
272 } catch (std::invalid_argument) {
273 std::cerr << "There is no execution backend with the name \"" << str << "\".\n" << AP;
274 std::exit(EXIT_FAILURE);
275 }
276 }
277 );
278#undef ADD
279 AP.parse_args(argc, argv);
280
281 if (args.show_help) {
282 usage(std::cout, argv[0]);
283 std::cout << "WHERE\n" << AP;
284 std::exit(EXIT_SUCCESS);
285 }
286
287 if (AP.args().size() != 0) {
288 std::cerr << "ERROR: Too many arguments.\n";
289 usage(std::cerr, argv[0]);
290 std::exit(EXIT_FAILURE);
291 }
292
293 if (args.gen_filter_model) {
294 std::cout << "Measurement data will be written to '" << args.gen_filter_model << "'.\n";
295 auto costmodel = CostModelFactory::get_cost_model<int32_t>(OperatorKind::FilterOperator,
296 args.gen_filter_model,
297 args.degree);
298 // create feature vector for cost prediction
299 Eigen::RowVectorXd feature_matrix(2);
300 feature_matrix << args.num_rows, args.selectivity;
301 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
302 exit(EXIT_SUCCESS);
303 }
304
305 if (args.gen_group_by_model) {
306 std::cout << "Measurement data will be written to '" << args.gen_group_by_model << "'.\n";
307 auto costmodel = CostModelFactory::get_cost_model<int32_t>(OperatorKind::GroupingOperator,
308 args.gen_group_by_model);
309 // create feature vector for cost prediction
310 Eigen::RowVectorXd feature_matrix(2);
311 feature_matrix << args.num_rows, args.num_distinct_values;
312 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
313 exit(EXIT_SUCCESS);
314 }
315
316 if (args.gen_join_model) {
317 std::cout << "Measurement data will be written to '" << args.gen_join_model << "'.\n";
318 auto costmodel = CostModelFactory::get_cost_model<int32_t>(OperatorKind::JoinOperator,
319 args.gen_join_model);
320 // create feature vector for cost prediction
321 Eigen::RowVectorXd feature_matrix(5);
322 feature_matrix << args.num_rows_left, args.num_rows_right, args.redundancy_left, args.redundancy_right,
323 args.result_size;
324 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
325 exit(EXIT_SUCCESS);
326 }
327
328 if (args.load_filter_model) {
329 auto costmodel = load_filter_cost_model<int32_t>(args.load_filter_model);
330 // create feature vector for cost prediction
331 Eigen::RowVectorXd feature_matrix(2);
332 feature_matrix << args.num_rows, args.selectivity;
333 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
334 exit(EXIT_SUCCESS);
335 }
336
337 if (args.load_group_by_model) {
338 auto costmodel = load_cost_model<int32_t>(args.load_group_by_model);
339 // create feature vector for cost prediction
340 Eigen::RowVectorXd feature_matrix(2);
341 feature_matrix << args.num_rows, args.num_distinct_values;
342 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
343 exit(EXIT_SUCCESS);
344 }
345
346 if (args.load_join_model) {
347 auto costmodel = load_cost_model<int32_t>(args.load_join_model);
348 // create feature vector for cost prediction
349 Eigen::RowVectorXd feature_matrix(5);
350 feature_matrix << args.num_rows_left, args.num_rows_right, args.redundancy_left, args.redundancy_right,
351 args.result_size;
352 std::cout << costmodel.predict_target(feature_matrix) << std::endl;
353 exit(EXIT_SUCCESS);
354 }
355
356 if (args.eval_filter_model) {
357 // TODO
358 }
359
360 if (args.eval_group_by_model) {
361 // TODO
362 }
363
364 if (args.eval_join_model) {
365 // TODO
366 }
367
368 exit(EXIT_SUCCESS);
369}
int main(void)
bool show_help
whether to show a help message
struct @5 args
A parser for command line arguments.
Definition: ArgParser.hpp:20
A model for predicting the costs of a physical operator.
Definition: LinearModel.hpp:11
#define M_insist(...)
Definition: macro.hpp:129
mutable namespace
Definition: Backend.hpp:10
LinearModel CostModel
Definition: CostModel.hpp:16
The catalog contains all Databases and keeps track of all meta information of the database system.
Definition: Catalog.hpp:215
static Catalog & Get()
Return a reference to the single Catalog instance.
CostModel load_cost_model(const char *csv_path)
Load a cost model for an operator without transformations from a file.
std::pair< RowMatrixXd, VectorXd > load_csv(const char *csv_path)
Matrix< double, Dynamic, Dynamic, RowMajor > RowMatrixXd
Parses csv file and returns a pair of matrices.
void usage(std::ostream &out, const char *name)
#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)
CostModel load_filter_cost_model(const char *csv_path, unsigned degree=9)
Load a cost model for the filter operator from a file.