mutable
A Database System for Research and Fast Prototyping
Loading...
Searching...
No Matches
cardinality_gen.cpp
Go to the documentation of this file.
1#include <cstddef>
2#include <cstdlib>
3#include <cstring>
4#include <deque>
5#include <filesystem>
6#include <fstream>
7#include <iostream>
8#include <memory>
10#include <mutable/mutable.hpp>
11#include <mutable/Options.hpp>
13#include <mutable/util/fn.hpp>
14#include <random>
15#include <stdexcept>
16#include <type_traits>
17#include <unordered_map>
18#include <unordered_set>
19#ifdef __BMI2__
20#include <x86intrin.h>
21#endif
22
23
25
26
30template<typename RealType>
32{
33 using real_type = RealType;
34 static_assert(std::is_floating_point_v<real_type>, "RealType must be a floating-point type");
35
36 private:
38 std::uniform_real_distribution<real_type> d_;
39
40 public:
42 : alpha_(alpha), d_(0, 1)
43 {
44 if (alpha_ <= 0)
45 throw std::invalid_argument("alpha must be positive");
46 }
47
48 real_type alpha() const { return alpha_; }
49
50 template<typename Generator>
51 real_type operator()(Generator &&g) { return std::pow(d_(std::forward<Generator>(g)), alpha_); }
52};
53
54struct
55{
59 unsigned seed;
63 std::size_t min_cardinality;
65 std::size_t max_cardinality;
67 double alpha;
69
70using table_type = std::unordered_map<Subproblem, uint64_t, m::SubproblemHash>;
71
72template<typename Generator>
73void generate_correlated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g);
74
75template<typename Generator>
76void generate_uncorrelated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g);
77
78void emit_cardinalities(std::ostream &out, const m::QueryGraph &G, const table_type &table);
79
80void usage(std::ostream &out, const char *name)
81{
82 out << "A tool to generate fake cardinalities for queries.\n"
83 << "USAGE:\n\t" << name << " <SCHEMA.sql> [<QUERY.sql>]"
84 << std::endl;
85}
86
87int main(int argc, const char **argv)
88{
90
91 /*----- Parse command line arguments. ----------------------------------------------------------------------------*/
92 m::ArgParser &AP = C.arg_parser();
93#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)\
94 VAR = INIT;\
95 {\
96 AP.add<TYPE>(SHORT, LONG, DESCR, CALLBACK);\
97 }
98 /*----- Help message ---------------------------------------------------------------------------------------------*/
99 ADD(bool, args.show_help, false, /* Type, Var, Init */
100 "-h", "--help", /* Short, Long */
101 "prints this help message", /* Description */
102 [&](bool) { args.show_help = true; }); /* Callback */
103 /*----- Seed -----------------------------------------------------------------------------------------------------*/
104 ADD(unsigned, args.seed, 42, /* Type, Var, Init */
105 nullptr, "--seed", /* Short, Long */
106 "the seed for the PRNG", /* Description */
107 [&](unsigned s) { args.seed = s; }); /* Callback */
108 /*----- Correlated selectivity -----------------------------------------------------------------------------------*/
109 ADD(bool, args.correlated_selectivities, true, /* Type, Var, Init */
110 nullptr, "--uncorrelated", /* Short, Long */
111 "make join selectivities uncorrelated", /* Description */
112 [&](bool) { args.correlated_selectivities = false; }); /* Callback */
113 /*----- Cardinalities --------------------------------------------------------------------------------------------*/
114 ADD(std::size_t, args.min_cardinality, 10, /* Type, Var, Init */
115 nullptr, "--min", /* Short, Long */
116 "the minimum cardinality of base tables", /* Description */
117 [&](std::size_t card) { args.min_cardinality = card; }); /* Callback */
118 ADD(std::size_t, args.max_cardinality, 1e4, /* Type, Var, Init */
119 nullptr, "--max", /* Short, Long */
120 "the maximum cardinality of base tables", /* Description */
121 [&](std::size_t card) { args.max_cardinality = card; }); /* Callback */
122 ADD(int, args.alpha, 3, /* Type, Var, Init */
123 nullptr, "--alpha", /* Short, Long */
124 "skewedness of cardinalities and selectivities, from ℤ", /* Description */
125 [&](int alpha) { /* Callback */
126 if (alpha == 0)
127 args.alpha = 1;
128 else if (alpha > 0)
129 args.alpha = alpha;
130 else
131 args.alpha = 1./-alpha;
132 });
133 /*----- Parse command line arguments. ----------------------------------------------------------------------------*/
134 AP.parse_args(argc, argv);
135
136 /*----- Help message. -----*/
137 if (args.show_help) {
138 usage(std::cout, argv[0]);
139 std::cout << "WHERE\n" << AP;
140 std::exit(EXIT_SUCCESS);
141 }
142
143 /*----- Validate command line arguments. -------------------------------------------------------------------------*/
144 if (AP.args().size() == 0 or AP.args().size() > 2) {
145 usage(std::cout, argv[0]);
146 std::exit(EXIT_FAILURE);
147 }
148
149 /*----- Configure mutable. ---------------------------------------------------------------------------------------*/
150 m::Options::Get().quiet = true;
151
152 /*----- Load schema. ---------------------------------------------------------------------------------------------*/
153 m::Diagnostic diag(false, std::cout, std::cerr);
154 std::filesystem::path path_to_schema(AP.args()[0]);
155 m::execute_file(diag, path_to_schema);
156 if (not C.has_database_in_use()) {
157 std::cerr << "No database selected.\n";
158 std::exit(EXIT_FAILURE);
159 }
160
161 /*----- Read input from stdin or file. ---------------------------------------------------------------------------*/
162 const std::string input = [&AP]() -> std::string {
163 if (AP.args().size() == 1) {
164 return std::string(std::istreambuf_iterator<char>(std::cin), {});
165 } else {
166 std::filesystem::path path(AP.args()[1]);
167 errno = 0;
168 std::ifstream in(path);
169 if (not in) {
170 std::cerr << "Could not open file '" << path << '\'';
171 const auto errsv = errno;
172 if (errsv)
173 std::cerr << ": " << strerror(errsv);
174 std::cerr << std::endl;
175 std::exit(EXIT_FAILURE);
176 }
177 return std::string(std::istreambuf_iterator<char>(in), {});
178 }
179 }();
180
181 /*----- Parse input. ---------------------------------------------------------------------------------------------*/
182 const std::unique_ptr<m::ast::SelectStmt> select = [&diag, &input]() -> std::unique_ptr<m::ast::SelectStmt> {
183 auto stmt = m::statement_from_string(diag, input);
184 if (not m::is<m::ast::SelectStmt>(stmt)) {
185 std::cerr << "Expected a SELECT statement.\n";
186 std::exit(EXIT_FAILURE);
187 }
188 return m::as<m::ast::SelectStmt>(std::move(stmt));
189 }();
190
191 /*----- Prepare graph, table, and generator. ---------------------------------------------------------------------*/
192 auto G = m::QueryGraph::Build(*select);
193 table_type table(G->num_sources() * G->num_sources());
194 std::mt19937_64 g(args.seed ^ 0x1d9a07cfbc6e4464UL);
195
196 /*----- Fill table with cardinalities for base relations. --------------------------------------------------------*/
197 const double delta = args.max_cardinality - args.min_cardinality;
199 for (unsigned i = 0; i != G->num_sources(); ++i) {
200 const std::size_t cardinality = args.min_cardinality + delta * dist(g);
201 M_insist(args.min_cardinality <= cardinality);
202 M_insist(cardinality <= args.max_cardinality);
203 table[Subproblem::Singleton(i)] = cardinality;
204 }
205
206 /*----- Generate cardinalities. ----------------------------------------------------------------------------------*/
207 if (args.correlated_selectivities)
209 else
211
212 /*----- Emit the table. ------------------------------------------------------------------------------------------*/
213 emit_cardinalities(std::cout, *G, table);
214
216}
217
218template<typename Generator>
219void generate_correlated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g)
220{
221 const Subproblem All = Subproblem::All(G.num_sources());
223 std::unordered_map<Subproblem, double, m::SubproblemHash> max_cardinalities;
224
226 auto cardinality = [&](const Subproblem S) -> double {
227 auto table_it = table.find(S);
228 if (table_it != table.end()) [[likely]] {
229 // We already know the cardinality of S.
230 M_insist(table_it->second <= args.max_cardinality * args.max_cardinality);
231 return table_it->second;
232 } else {
233 /* Randomly roll the cardinality for S, but within the bounds of `args.min_cardinality`,
234 * `args.max_cardinality`^2, and the computed maximum cardinality. */
235 const auto it = max_cardinalities.find(S);
236 M_insist(it != max_cardinalities.end());
237 M_insist(it->second > args.min_cardinality);
238 skewed_distribution<double> selectivity_dist(args.alpha);
239 const double max_cardinality = std::min<double>(it->second, args.max_cardinality * args.max_cardinality);
240 M_insist(max_cardinality >= args.min_cardinality);
241 const double c = args.min_cardinality + (max_cardinality - args.min_cardinality) * selectivity_dist(g);
242 M_insist(c != std::numeric_limits<double>::infinity());
243 M_insist(c <= max_cardinality, "cardinality out of bounds");
244 M_insist(c <= it->second, "inconsistent cardinality");
245 max_cardinalities.erase(it);
246 table_it = table.emplace_hint(table_it, S, c);
247 M_insist(table_it->second <= c);
248 return c;
249 }
250 };
251
252 auto update = [&](const Subproblem S1, const Subproblem S2) -> void {
253 const double cardinality_S1 = cardinality(S1);
254 const double cardinality_S2 = cardinality(S2);
255
256 M_insist(table.count(S1|S2) == 0, "must not have a cardinality for S1|S2 yet");
257 /* Update the maximum cardinality of S1 υ S2. */
258 if (auto it = max_cardinalities.find(S1|S2); it == max_cardinalities.end())
259 max_cardinalities.emplace_hint(it, S1|S2, cardinality_S1 * cardinality_S2);
260 else
261 it->second = std::min(it->second, cardinality_S1 * cardinality_S2);
262 };
263
264 M.for_each_CSG_pair_undirected(All, update);
265 M_insist(max_cardinalities.size() == 1);
266 table[All] = cardinality(All);
267}
268
269template<typename Generator>
271{
272 const Subproblem All = Subproblem::All(G.num_sources());
274 skewed_distribution<double> selectivity_dist(args.alpha);
275 // std::uniform_real_distribution<double> selectivity_dist(0, 1);
276 const std::size_t delta = args.max_cardinality - args.min_cardinality;
277
278 /* Roll result cardinality. */
279 const std::size_t cardinality_of_result = args.min_cardinality + delta * selectivity_dist(g);
280
281 /*----- Compute combined selectivity. -----*/
282 const double combined_selectivity = [&]() -> double {
283 double cardinality_of_Cartesian_product = 1;
284 for (auto it = All.begin(); it != All.end(); ++it)
285 cardinality_of_Cartesian_product *= table[it.as_set()];
286 const double x = cardinality_of_result / cardinality_of_Cartesian_product;
287 return x;
288 }();
289
290 const double avg_selectivity = std::pow(combined_selectivity, 1. / G.num_joins());
291
292 /*----- Generate selectivities for joins. -----*/
293 std::vector<double> selectivities(G.num_joins());
294 double remaining_selectivity = combined_selectivity;
295 for (std::size_t j = 1; j != G.num_joins(); ++j) {
296 auto &J = G.joins()[j]->sources();
297
298 /*----- Create a fresh PRNG from the sources that are being joined. -----*/
299 std::size_t seed = 0;
300 for (auto s : J)
301 seed = (seed * 526122883134911UL) ^ m::StrHash{}(*s.get().name());
302 std::mt19937_64 local_g(seed);
303
304 /*----- Compute the selectivity of this join. -----*/
305 double cartesian = 1;
306 for (auto s : J)
307 cartesian *= table[Subproblem::Singleton(s.get().id())];
308 const double min_selectivity = std::max(args.min_cardinality / cartesian, remaining_selectivity);
309 if (min_selectivity < avg_selectivity) {
310 /*----- Draw selectivity between min and average. -----*/
311 M_insist(min_selectivity <= avg_selectivity);
312 selectivities[j] = avg_selectivity - (avg_selectivity - min_selectivity) * selectivity_dist(local_g);
313 } else {
314 /*----- Draw selectivity between average and 1. -----*/
315 selectivities[j] = avg_selectivity + (1. - avg_selectivity) * selectivity_dist(local_g);
316 }
317 remaining_selectivity /= selectivities[j];
318 }
319 selectivities[0] = remaining_selectivity;
320
321
322 auto update = [&G, &M, &selectivities, &table](const Subproblem left, const Subproblem right) {
323 M_insist(M.is_connected(left, right));
324
325 /* Compute total selectivity as product of selectivities of all edges between `left` and `right`. */
326 double real_cardinality = table[left] * table[right];
327 for (std::size_t j = 0; j != G.num_joins(); ++j) {
328 auto &sources = G.joins()[j]->sources();
329 if (sources.size() != 2)
330 throw std::invalid_argument("unsupported join");
331 /* Check whether this join connects `left` and `right`. */
332 if ((left[sources[0].get().id()] and right[sources[1].get().id()]) or
333 (left[sources[1].get().id()] and right[sources[0].get().id()]))
334 {
335 /* This join connects `left` and `right`. */
336 real_cardinality *= selectivities[j];
337 }
338 }
339
340 /* Compute cardinality of CSG. */
341 table[left | right] = std::clamp<double>(real_cardinality, 1UL, args.max_cardinality * args.max_cardinality);
342 };
343
344 M.for_each_CSG_pair_undirected(All, update);
345}
346
347void emit_cardinalities(std::ostream &out, const m::QueryGraph &G, const table_type &table)
348{
351
352 std::vector<std::pair<Subproblem, double>> cardinalities{table.cbegin(), table.cend()};
353 std::sort(cardinalities.begin(), cardinalities.end(), [](const auto &first, const auto &second) {
354 return uint64_t(first.first) < uint64_t(second.first);
355 });
356
357 out << "{\n \"" << DB.name << "\": [\n";
358 bool first = true;
359 for (auto [S, C] : cardinalities) {
360 /*----- Emit relations. -----*/
361 if (first) first = false;
362 else out << ",\n";
363 out << " { \"relations\": [";
364 for (auto it = S.begin(); it != S.end(); ++it) {
365 if (it != S.begin()) out << ", ";
366 auto &DS = G.sources()[*it];
367 out << '"' << DS->name() << '"';
368 }
369 /*----- Emit size. -----*/
370 out << "], \"size\": " << std::size_t(C) << "}";
371 }
372 out << "\n ]\n}\n";
373}
and(sizeof(T)==4) U64x1 reinterpret_to_U64(m
Definition: WasmAlgo.cpp:266
int main(void)
std::size_t max_cardinality
‍maximum cardinality of relations and intermediate results
double alpha
‍alpha for skewed distribution
void generate_correlated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g)
void emit_cardinalities(std::ostream &out, const m::QueryGraph &G, const table_type &table)
bool correlated_selectivities
‍whether selectivities are correlated
std::size_t min_cardinality
‍minimum cardinality of relations and intermediate results
void usage(std::ostream &out, const char *name)
std::unordered_map< Subproblem, uint64_t, m::SubproblemHash > table_type
#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)
unsigned seed
‍the seed for the PRNG
void generate_uncorrelated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g)
bool show_help
‍whether to show a help message
struct @5 args
A parser for command line arguments.
Definition: ArgParser.hpp:20
void parse_args(int argc, const char **argv)
Parses the arguments from argv.
Definition: ArgParser.cpp:186
const std::vector< const char * > & args() const
Returns all positional arguments.
Definition: ArgParser.hpp:143
#define M_insist(...)
Definition: macro.hpp:129
void M_EXPORT execute_file(Diagnostic &diag, const std::filesystem::path &path)
Execute the SQL file at path.
Definition: mutable.cpp:410
std::unique_ptr< ast::Stmt > M_EXPORT statement_from_string(Diagnostic &diag, const std::string &str)
Use lexer, parser, and semantic analysis to create a Stmt from str.
Definition: mutable.cpp:25
An adjacency matrix for a given query graph.
void for_each_CSG_pair_undirected(SmallBitset super, std::function< void(SmallBitset, SmallBitset)> callback) const
Enumerate all pairs of connected subgraphs (CSGs) that are connected by at least one edge.
bool is_connected(SmallBitset S) const
Returns true iff the subproblem S is connected.
The catalog contains all Databases and keeps track of all meta information of the database system.
Definition: Catalog.hpp:215
Database & get_database_in_use()
Returns a reference to the Database that is currently in use, if any.
Definition: Catalog.hpp:295
bool has_database_in_use() const
Returns true if any Database is currently in use.
Definition: Catalog.hpp:293
static Catalog & Get()
Return a reference to the single Catalog instance.
static void Destroy()
Destroys the current Catalog instance.
m::ArgParser & arg_parser()
Definition: Catalog.hpp:253
A Database is a set of Tables, Functions, and Statistics.
Definition: Schema.hpp:870
ThreadSafePooledString name
the name of the database
Definition: Schema.hpp:892
static Options & Get()
Return a reference to the single Options instance.
Definition: Options.cpp:9
bool quiet
Definition: Options.hpp:35
The query graph represents all data sources and joins in a graph structure.
Definition: QueryGraph.hpp:172
std::size_t num_joins() const
Returns the number of Joins in this graph.
Definition: QueryGraph.hpp:227
const auto & joins() const
Definition: QueryGraph.hpp:258
std::size_t num_sources() const
Returns the number of DataSources in this graph.
Definition: QueryGraph.hpp:224
const auto & sources() const
Definition: QueryGraph.hpp:257
static std::unique_ptr< QueryGraph > Build(const ast::Stmt &stmt)
AdjacencyMatrix & adjacency_matrix()
Definition: QueryGraph.hpp:277
Implements a small and efficient set over integers in the range of 0 to 63 (including).
Definition: ADT.hpp:26
static SmallBitset All(std::size_t n)
Factory method for creating a SmallBitset with first n bits set.
Definition: ADT.hpp:117
auto end() const
Definition: ADT.hpp:175
static SmallBitset Singleton(std::size_t n)
Factory method for creating a Singleton Smallbitset with n -th bit set.
Definition: ADT.hpp:125
auto begin() const
Definition: ADT.hpp:173
Computes the FNV-1a 64-bit hash of a cstring.
Definition: fn.hpp:71
A distribution of values in range [0; 1] that are skewed towards 0.
real_type alpha() const
std::uniform_real_distribution< real_type > d_
real_type operator()(Generator &&g)
skewed_distribution(real_type alpha)