13 std::vector<Spn::LeafType> leaf_types)
16 auto &db = C.get_database(name_of_database);
17 auto &table = db.get_table(name_of_table);
19 leaf_types.resize(table.num_attrs(),
Spn::AUTO);
22 auto old_estimator = db.cardinality_estimator(C.create_cardinality_estimator(C.pool(
"CartesianProduct"), db.name));
24 std::size_t num_columns = table.num_attrs();
25 std::size_t
num_rows = table.store().num_rows();
29 auto primary_key = table.primary_key();
30 std::vector<std::size_t> primary_key_id;
31 for (
auto &elem : primary_key) {
32 primary_key_id.push_back(elem.get().id);
35 MatrixXf data(
num_rows, num_columns - primary_key_id.size());
36 MatrixXi null_matrix = MatrixXi::Zero(data.rows(), data.cols());
37 std::unordered_map<ThreadSafePooledString, unsigned> attribute_to_id;
39 std::size_t primary_key_count = 0;
41 const std::string table_name = *table.name();
43 std::unique_ptr<ast::SelectStmt> select_stmt(
dynamic_cast<ast::SelectStmt*
>(stmt.release()));
46 for (std::size_t current_column = 0; current_column < num_columns; current_column++) {
47 auto lower_bound = std::lower_bound(primary_key_id.begin(), primary_key_id.end(), current_column);
53 auto attribute = table.schema()[current_column].id.name;
54 attribute_to_id.emplace(attribute, current_column - primary_key_count);
56 auto &type = table.at(current_column).type;
57 std::size_t current_row = 0;
59 if (type->is_float()) {
60 if (leaf_types[current_column - primary_key_count] ==
Spn::AUTO) {
63 auto callback_data = std::make_unique<CallbackOperator>([&](
const Schema &S,
const Tuple &
T) {
64 if (
T.is_null(current_column)) {
65 null_matrix(current_row, current_column - primary_key_count) = 1;
66 data(current_row, current_column - primary_key_count) = 0;
68 data(current_row, current_column - primary_key_count) = T.get(current_column).as_f();
75 if (type->is_double()) {
76 if (leaf_types[current_column - primary_key_count] ==
Spn::AUTO) {
79 auto callback_data = std::make_unique<CallbackOperator>([&](
const Schema &S,
const Tuple &
T) {
80 if (
T.is_null(current_column)) {
81 null_matrix(current_row, current_column - primary_key_count) = 1;
82 data(current_row, current_column - primary_key_count) = 0;
84 data(current_row, current_column - primary_key_count) = float(T.get(current_column).as_d());
91 if (type->is_integral()) {
92 if (leaf_types[current_column - primary_key_count] ==
Spn::AUTO) {
93 leaf_types[current_column - primary_key_count] =
Spn::DISCRETE;
95 auto callback_data = std::make_unique<CallbackOperator>([&](
const Schema &S,
const Tuple &
T) {
96 if (
T.is_null(current_column)) {
97 null_matrix(current_row, current_column - primary_key_count) = 1;
98 data(current_row, current_column - primary_key_count) = 0;
100 data(current_row, current_column - primary_key_count) = float(T.get(current_column).as_i());
107 if (type->is_character_sequence()) {
108 if (leaf_types[current_column - primary_key_count] ==
Spn::AUTO) {
111 auto callback_data = std::make_unique<CallbackOperator>([&](
const Schema &S,
const Tuple &
T) {
112 if (
T.is_null(current_column)) {
113 null_matrix(current_row, current_column - primary_key_count) = 1;
114 data(current_row, current_column - primary_key_count) = 0;
116 auto v_pointer = T.get(current_column).as_p();
117 const char* value = static_cast<const char*>(v_pointer);
118 data(current_row, current_column - primary_key_count) = float(std::hash<const char*>{}(value));
127 db.cardinality_estimator(std::move(old_estimator));
132std::unordered_map<ThreadSafePooledString, SpnWrapper*>
137 auto &db = C.get_database(name_of_database);
139 std::unordered_map<ThreadSafePooledString, SpnWrapper*> spns;
141 for (
auto table_it = db.begin_tables(); table_it != db.end_tables(); table_it++) {
145 learn_spn_table(name_of_database, table_it->first, std::move(leaf_types[table_it->first]))
ThreadSafeStringPool::proxy_type ThreadSafePooledString
std::unique_ptr< ast::Stmt > M_EXPORT statement_from_string(Diagnostic &diag, const std::string &str)
Use lexer, parser, and semantic analysis to create a Stmt from str.
void M_EXPORT execute_query(Diagnostic &diag, const ast::SelectStmt &stmt, std::unique_ptr< Consumer > consumer)
Optimizes and executes the given SelectStmt.
static Catalog & Get()
Return a reference to the single Catalog instance.
A Schema represents a sequence of identifiers, optionally with a prefix, and their associated types.
A wrapper class for an Spn to be used in the context of databases.
std::size_t num_rows() const
Returns the number of rows in the SPN.
static std::unordered_map< ThreadSafePooledString, SpnWrapper * > learn_spn_database(const ThreadSafePooledString &name_of_database, std::unordered_map< ThreadSafePooledString, std::vector< Spn::LeafType > > leaf_types=decltype(leaf_types)())
Learn SPNs over the tables in the given database.
float lower_bound(const AttrFilter &attr_filter) const
Compute the lower bound probability for continuous domains.
static SpnWrapper learn_spn_table(const ThreadSafePooledString &name_of_database, const ThreadSafePooledString &name_of_table, std::vector< Spn::LeafType > leaf_types=decltype(leaf_types)())
Learn an SPN over the given table.
static Spn learn_spn(Eigen::MatrixXf &data, Eigen::MatrixXi &null_matrix, std::vector< LeafType > &leaf_types)
Learn an SPN over the given data.