17#include <unordered_map>
18#include <unordered_set>
30template<
typename RealType>
34 static_assert(std::is_floating_point_v<real_type>,
"RealType must be a floating-point type");
38 std::uniform_real_distribution<real_type>
d_;
45 throw std::invalid_argument(
"alpha must be positive");
50 template<
typename Generator>
70using table_type = std::unordered_map<Subproblem, uint64_t, m::SubproblemHash>;
72template<
typename Generator>
75template<
typename Generator>
80void usage(std::ostream &out,
const char *name)
82 out <<
"A tool to generate fake cardinalities for queries.\n"
83 <<
"USAGE:\n\t" << name <<
" <SCHEMA.sql> [<QUERY.sql>]"
87int main(
int argc,
const char **argv)
93#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)\
96 AP.add<TYPE>(SHORT, LONG, DESCR, CALLBACK);\
99 ADD(
bool,
args.show_help,
false,
101 "prints this help message",
102 [&](
bool) { args.show_help = true; });
106 "the seed for the PRNG",
107 [&](
unsigned s) { args.seed = s; });
109 ADD(
bool,
args.correlated_selectivities,
true,
110 nullptr,
"--uncorrelated",
111 "make join selectivities uncorrelated",
112 [&](
bool) { args.correlated_selectivities = false; });
114 ADD(std::size_t,
args.min_cardinality, 10,
116 "the minimum cardinality of base tables",
117 [&](std::size_t card) { args.min_cardinality = card; });
118 ADD(std::size_t,
args.max_cardinality, 1e4,
120 "the maximum cardinality of base tables",
121 [&](std::size_t card) { args.max_cardinality = card; });
124 "skewedness of cardinalities and selectivities, from ℤ",
131 args.alpha = 1./-alpha;
137 if (
args.show_help) {
138 usage(std::cout, argv[0]);
139 std::cout <<
"WHERE\n" << AP;
140 std::exit(EXIT_SUCCESS);
144 if (AP.
args().size() == 0 or AP.
args().size() > 2) {
145 usage(std::cout, argv[0]);
146 std::exit(EXIT_FAILURE);
154 std::filesystem::path path_to_schema(AP.
args()[0]);
157 std::cerr <<
"No database selected.\n";
158 std::exit(EXIT_FAILURE);
162 const std::string input = [&AP]() -> std::string {
163 if (AP.
args().size() == 1) {
164 return std::string(std::istreambuf_iterator<char>(std::cin), {});
166 std::filesystem::path path(AP.
args()[1]);
168 std::ifstream in(path);
170 std::cerr <<
"Could not open file '" << path <<
'\'';
171 const auto errsv = errno;
173 std::cerr <<
": " << strerror(errsv);
174 std::cerr << std::endl;
175 std::exit(EXIT_FAILURE);
177 return std::string(std::istreambuf_iterator<char>(in), {});
182 const std::unique_ptr<m::ast::SelectStmt> select = [&diag, &input]() -> std::unique_ptr<m::ast::SelectStmt> {
184 if (not m::is<m::ast::SelectStmt>(stmt)) {
185 std::cerr <<
"Expected a SELECT statement.\n";
186 std::exit(EXIT_FAILURE);
188 return m::as<m::ast::SelectStmt>(std::move(stmt));
193 table_type table(G->num_sources() * G->num_sources());
194 std::mt19937_64 g(
args.seed ^ 0x1d9a07cfbc6e4464UL);
197 const double delta =
args.max_cardinality -
args.min_cardinality;
199 for (
unsigned i = 0; i != G->num_sources(); ++i) {
200 const std::size_t cardinality =
args.min_cardinality + delta * dist(g);
207 if (
args.correlated_selectivities)
218template<
typename Generator>
223 std::unordered_map<Subproblem, double, m::SubproblemHash> max_cardinalities;
226 auto cardinality = [&](
const Subproblem S) ->
double {
227 auto table_it = table.find(S);
228 if (table_it != table.end()) [[likely]] {
231 return table_it->second;
235 const auto it = max_cardinalities.find(S);
236 M_insist(it != max_cardinalities.end());
242 M_insist(c != std::numeric_limits<double>::infinity());
244 M_insist(c <= it->second,
"inconsistent cardinality");
245 max_cardinalities.erase(it);
246 table_it = table.emplace_hint(table_it, S, c);
253 const double cardinality_S1 = cardinality(S1);
254 const double cardinality_S2 = cardinality(S2);
256 M_insist(table.count(S1|S2) == 0,
"must not have a cardinality for S1|S2 yet");
258 if (
auto it = max_cardinalities.find(S1|S2); it == max_cardinalities.end())
259 max_cardinalities.emplace_hint(it, S1|S2, cardinality_S1 * cardinality_S2);
261 it->second = std::min(it->second, cardinality_S1 * cardinality_S2);
265 M_insist(max_cardinalities.size() == 1);
266 table[All] = cardinality(All);
269template<
typename Generator>
276 const std::size_t delta =
args.max_cardinality -
args.min_cardinality;
279 const std::size_t cardinality_of_result =
args.min_cardinality + delta * selectivity_dist(g);
282 const double combined_selectivity = [&]() ->
double {
283 double cardinality_of_Cartesian_product = 1;
284 for (
auto it = All.
begin(); it != All.
end(); ++it)
285 cardinality_of_Cartesian_product *= table[it.as_set()];
286 const double x = cardinality_of_result / cardinality_of_Cartesian_product;
290 const double avg_selectivity = std::pow(combined_selectivity, 1. / G.
num_joins());
293 std::vector<double> selectivities(G.
num_joins());
294 double remaining_selectivity = combined_selectivity;
295 for (std::size_t j = 1; j != G.
num_joins(); ++j) {
296 auto &J = G.
joins()[j]->sources();
299 std::size_t
seed = 0;
302 std::mt19937_64 local_g(
seed);
305 double cartesian = 1;
308 const double min_selectivity = std::max(
args.min_cardinality / cartesian, remaining_selectivity);
309 if (min_selectivity < avg_selectivity) {
311 M_insist(min_selectivity <= avg_selectivity);
312 selectivities[j] = avg_selectivity - (avg_selectivity - min_selectivity) * selectivity_dist(local_g);
315 selectivities[j] = avg_selectivity + (1. - avg_selectivity) * selectivity_dist(local_g);
317 remaining_selectivity /= selectivities[j];
319 selectivities[0] = remaining_selectivity;
326 double real_cardinality = table[left] * table[right];
327 for (std::size_t j = 0; j != G.
num_joins(); ++j) {
328 auto &sources = G.
joins()[j]->sources();
329 if (sources.size() != 2)
330 throw std::invalid_argument(
"unsupported join");
332 if ((left[sources[0].get().
id()]
and right[sources[1].get().
id()]) or
333 (left[sources[1].get().
id()]
and right[sources[0].get().
id()]))
336 real_cardinality *= selectivities[j];
341 table[left | right] = std::clamp<double>(real_cardinality, 1UL,
args.max_cardinality *
args.max_cardinality);
352 std::vector<std::pair<Subproblem, double>> cardinalities{table.cbegin(), table.cend()};
353 std::sort(cardinalities.begin(), cardinalities.end(), [](
const auto &first,
const auto &second) {
354 return uint64_t(first.first) < uint64_t(second.first);
357 out <<
"{\n \"" << DB.
name <<
"\": [\n";
359 for (
auto [S, C] : cardinalities) {
361 if (first) first =
false;
363 out <<
" { \"relations\": [";
364 for (
auto it = S.begin(); it != S.end(); ++it) {
365 if (it != S.begin()) out <<
", ";
367 out <<
'"' << DS->name() <<
'"';
370 out <<
"], \"size\": " << std::size_t(C) <<
"}";
and(sizeof(T)==4) U64x1 reinterpret_to_U64(m
std::size_t max_cardinality
maximum cardinality of relations and intermediate results
double alpha
alpha for skewed distribution
void generate_correlated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g)
void emit_cardinalities(std::ostream &out, const m::QueryGraph &G, const table_type &table)
bool correlated_selectivities
whether selectivities are correlated
std::size_t min_cardinality
minimum cardinality of relations and intermediate results
void usage(std::ostream &out, const char *name)
std::unordered_map< Subproblem, uint64_t, m::SubproblemHash > table_type
#define ADD(TYPE, VAR, INIT, SHORT, LONG, DESCR, CALLBACK)
unsigned seed
the seed for the PRNG
void generate_uncorrelated_cardinalities(table_type &table, const m::QueryGraph &G, Generator &&g)
bool show_help
whether to show a help message
A parser for command line arguments.
void parse_args(int argc, const char **argv)
Parses the arguments from argv.
const std::vector< const char * > & args() const
Returns all positional arguments.
void M_EXPORT execute_file(Diagnostic &diag, const std::filesystem::path &path)
Execute the SQL file at path.
std::unique_ptr< ast::Stmt > M_EXPORT statement_from_string(Diagnostic &diag, const std::string &str)
Use lexer, parser, and semantic analysis to create a Stmt from str.
An adjacency matrix for a given query graph.
void for_each_CSG_pair_undirected(SmallBitset super, std::function< void(SmallBitset, SmallBitset)> callback) const
Enumerate all pairs of connected subgraphs (CSGs) that are connected by at least one edge.
bool is_connected(SmallBitset S) const
Returns true iff the subproblem S is connected.
The catalog contains all Databases and keeps track of all meta information of the database system.
Database & get_database_in_use()
Returns a reference to the Database that is currently in use, if any.
bool has_database_in_use() const
Returns true if any Database is currently in use.
static Catalog & Get()
Return a reference to the single Catalog instance.
static void Destroy()
Destroys the current Catalog instance.
m::ArgParser & arg_parser()
A Database is a set of Tables, Functions, and Statistics.
ThreadSafePooledString name
the name of the database
static Options & Get()
Return a reference to the single Options instance.
The query graph represents all data sources and joins in a graph structure.
std::size_t num_joins() const
Returns the number of Joins in this graph.
const auto & joins() const
std::size_t num_sources() const
Returns the number of DataSources in this graph.
const auto & sources() const
static std::unique_ptr< QueryGraph > Build(const ast::Stmt &stmt)
AdjacencyMatrix & adjacency_matrix()
Implements a small and efficient set over integers in the range of 0 to 63 (including).
static SmallBitset All(std::size_t n)
Factory method for creating a SmallBitset with first n bits set.
static SmallBitset Singleton(std::size_t n)
Factory method for creating a Singleton Smallbitset with n -th bit set.
Computes the FNV-1a 64-bit hash of a cstring.
A distribution of values in range [0; 1] that are skewed towards 0.
std::uniform_real_distribution< real_type > d_
real_type operator()(Generator &&g)
skewed_distribution(real_type alpha)