-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DRAFT] DynFD algorithm (2/4 search strategies) #462
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,16 @@ | ||
cmake-build-debug/ | ||
build/ | ||
Build/ | ||
BUILD/ | ||
bUILD/ | ||
Datasets/ | ||
.sconsign.dblite | ||
lib/ | ||
**/.DS_Store | ||
**/myeasylog.log | ||
.cache/ | ||
.csv | ||
.idea/ | ||
.sconsign.dblite | ||
.vscode/ | ||
**/myeasylog.log | ||
BUILD/ | ||
Build/ | ||
Datasets/ | ||
bUILD/ | ||
build/ | ||
cmake-build-debug/ | ||
dist | ||
lib/ | ||
venv |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
#include "dynfd.h" | ||
|
||
#include <easylogging++.h> | ||
|
||
#include "algo_factory.h" | ||
#include "algorithms/fd/hycommon/all_column_combinations.h" | ||
#include "algorithms/fd/hycommon/preprocessor.h" | ||
#include "algorithms/fd/hycommon/util/pli_util.h" | ||
#include "algorithms/fd/hyfd/inductor.h" | ||
#include "algorithms/fd/hyfd/sampler.h" | ||
#include "algorithms/fd/hyfd/validator.h" | ||
#include "algorithms/fd/raw_fd.h" | ||
#include "indices/option.h" | ||
#include "option_using.h" | ||
#include "tabular_data/crud_operations/delete/option.h" | ||
#include "tabular_data/crud_operations/insert/option.h" | ||
#include "tabular_data/crud_operations/operations.h" | ||
#include "tabular_data/crud_operations/update/option.h" | ||
#include "tabular_data/input_table/option.h" | ||
|
||
namespace algos::dynfd { | ||
|
||
void DynFD::ExecuteHyFD() { | ||
std::shared_ptr hy_fd_relation = ColumnLayoutRelationData::CreateFrom(*input_table_, true); | ||
|
||
auto [plis, pli_records, og_mapping] = hy::Preprocess(hy_fd_relation.get()); | ||
auto plis_shared = std::make_shared<hy::PLIs>(std::move(plis)); | ||
auto const pli_records_shared = std::make_shared<hy::Rows>(std::move(pli_records)); | ||
|
||
hyfd::Sampler sampler(plis_shared, pli_records_shared); | ||
|
||
auto positive_cover_tree = std::make_shared<model::FDTree>(hy_fd_relation->GetNumColumns()); | ||
hyfd::Inductor inductor(positive_cover_tree); | ||
hyfd::Validator validator(positive_cover_tree, plis_shared, pli_records_shared); | ||
|
||
hy::IdPairs comparison_suggestions; | ||
|
||
while (true) { | ||
auto non_fds = sampler.GetNonFDs(comparison_suggestions); | ||
|
||
inductor.UpdateFdTree(std::move(non_fds)); | ||
|
||
comparison_suggestions = validator.ValidateAndExtendCandidates(); | ||
|
||
if (comparison_suggestions.empty()) { | ||
break; | ||
} | ||
|
||
LOG(TRACE) << "Cycle done"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. мб это хочется до прерывания цикла писать? |
||
} | ||
|
||
for (size_t rhs = 0; rhs < hy_fd_relation->GetNumColumns(); ++rhs) { | ||
positive_cover_tree_->Remove(boost::dynamic_bitset(hy_fd_relation->GetNumColumns()), rhs); | ||
} | ||
for (auto fd : positive_cover_tree->FillFDs()) { | ||
fd.lhs_ = hy::RestoreAgreeSet(fd.lhs_, og_mapping, hy_fd_relation->GetNumColumns()); | ||
fd.rhs_ = og_mapping[fd.rhs_]; | ||
positive_cover_tree_->AddFD(fd.lhs_, fd.rhs_); | ||
} | ||
} | ||
|
||
unsigned long long DynFD::ExecuteInternal() { | ||
auto const start_time = std::chrono::system_clock::now(); | ||
|
||
bool const is_non_fd_validation_needed = | ||
(!delete_statement_indices_.empty()) || (update_statements_table_ != nullptr); | ||
bool const is_fd_validation_needed = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. по хорошему переместить на 80 строчку |
||
(update_statements_table_ != nullptr) || (insert_statements_table_ != nullptr); | ||
|
||
if (!delete_statement_indices_.empty()) { | ||
relation_->DeleteBatch(delete_statement_indices_); | ||
} | ||
if (update_statements_table_ != nullptr) { | ||
relation_->DeleteRecordsFromUpdateBatch(update_statements_table_); | ||
} | ||
|
||
if (is_non_fd_validation_needed) { | ||
validator_->ValidateNonFds(); | ||
} | ||
|
||
size_t const first_insert_batch_id = relation_->GetNextRecordId(); | ||
if (update_statements_table_ != nullptr) { | ||
relation_->InsertRecordsFromUpdateBatch(update_statements_table_); | ||
} | ||
if (insert_statements_table_ != nullptr) { | ||
relation_->InsertBatch(insert_statements_table_); | ||
} | ||
|
||
if (is_fd_validation_needed) { | ||
validator_->ValidateFds(first_insert_batch_id); | ||
} | ||
|
||
SetProgress(kTotalProgressPercent); | ||
RegisterFDs(positive_cover_tree_->FillFDs()); | ||
auto const elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>( | ||
std::chrono::system_clock::now() - start_time); | ||
return elapsed_milliseconds.count(); | ||
} | ||
|
||
void DynFD::LoadDataInternal() { | ||
relation_ = DynamicRelationData::CreateFrom(input_table_); | ||
if (relation_->GetColumnData().empty()) { | ||
throw std::runtime_error( | ||
"Got an empty dataset: FD mining is meaningless. If you want to specify columns, " | ||
"insert their names"); | ||
} | ||
positive_cover_tree_ = std::make_shared<model::FDTree>(GetRelation().GetNumColumns()); | ||
|
||
if (!relation_->Empty()) { | ||
ExecuteHyFD(); | ||
} | ||
|
||
negative_cover_tree_ = std::make_shared<NonFDTree>(GetRelation().GetNumColumns()); | ||
|
||
// Cover inversion | ||
for (size_t i = 0; i < relation_->GetNumColumns(); i++) { | ||
boost::dynamic_bitset<> lhs(relation_->GetNumColumns()); | ||
lhs.set(); | ||
lhs.reset(i); | ||
negative_cover_tree_->AddNonFD(lhs, i, std::nullopt); | ||
} | ||
|
||
for (auto&& [lhs, rhs] : positive_cover_tree_->FillFDs()) { | ||
std::vector<boost::dynamic_bitset<>> violated = negative_cover_tree_->GetSpecials(lhs, rhs); | ||
for (auto&& non_fd : violated) { | ||
negative_cover_tree_->Remove(non_fd, rhs); | ||
for (size_t bit = lhs.find_first(); bit != boost::dynamic_bitset<>::npos; | ||
bit = lhs.find_next(bit)) { | ||
boost::dynamic_bitset<> new_lhs = non_fd; | ||
new_lhs.reset(bit); | ||
if (!negative_cover_tree_->ContainsNonFdOrSpecial(new_lhs, rhs)) { | ||
negative_cover_tree_->AddNonFD(new_lhs, rhs, std::nullopt); | ||
} | ||
} | ||
} | ||
} | ||
|
||
validator_ = std::make_shared<Validator>(positive_cover_tree_, negative_cover_tree_, relation_); | ||
} | ||
|
||
void DynFD::MakeExecuteOptsAvailableFDInternal() { | ||
using namespace config::names; | ||
MakeOptionsAvailable(kCrudOptions); | ||
} | ||
|
||
void DynFD::RegisterOptions() { | ||
DESBORDANTE_OPTION_USING; | ||
|
||
auto check_inserts = [this](config::InputTable const& insert_batch) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. если это те же функции, что и у меня в DynFDVerifier, то мб вынесем их в условный tabular_data/crud_operations/validate_operations.h? |
||
if (insert_batch == nullptr || !insert_batch->HasNextRow()) { | ||
return; | ||
} | ||
if (insert_batch->GetNumberOfColumns() != input_table_->GetNumberOfColumns()) { | ||
throw config::ConfigurationError( | ||
"Schema mismatch: insert statements must have the same number of columns as " | ||
"the input table"); | ||
} | ||
for (size_t i = 0; i < input_table_->GetNumberOfColumns(); ++i) { | ||
if (insert_batch->GetColumnName(i) != input_table_->GetColumnName(i)) { | ||
throw config::ConfigurationError( | ||
"Schema mismatch: insert statements' column names must match the input " | ||
"table"); | ||
} | ||
} | ||
}; | ||
|
||
auto check_deletes = [this](std::unordered_set<size_t> const& delete_batch) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
if (delete_batch.empty()) { | ||
return; | ||
} | ||
for (size_t const id : delete_batch) { | ||
if (!relation_->IsRowIndexValid(id)) { | ||
throw config::ConfigurationError("Attempt to delete a non-existing row"); | ||
} | ||
} | ||
}; | ||
|
||
auto check_updates = [this](config::InputTable const& update_batch) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
if (update_batch == nullptr || !update_batch->HasNextRow()) { | ||
return; | ||
} | ||
if (update_batch->GetNumberOfColumns() != input_table_->GetNumberOfColumns() + 1) { | ||
throw config::ConfigurationError( | ||
"Schema mismatch: update statements must have the number of columns one more " | ||
"than the input table"); | ||
} | ||
for (size_t i = 0; i < input_table_->GetNumberOfColumns(); ++i) { | ||
if (update_batch->GetColumnName(i + 1) != input_table_->GetColumnName(i)) { | ||
throw config::ConfigurationError( | ||
"Schema mismatch: update statements column names, except of first one, " | ||
"must match the input table"); | ||
} | ||
} | ||
std::unordered_set<size_t> rows_to_update; | ||
while (update_batch->HasNextRow()) { | ||
auto row = update_batch->GetNextRow(); | ||
size_t id = std::stoull(row.front()); | ||
if (!relation_->IsRowIndexValid(id)) { | ||
throw config::ConfigurationError("Attempt to update a non-existing row"); | ||
} | ||
if (rows_to_update.contains(id)) { | ||
throw config::ConfigurationError("Update statements have duplicates"); | ||
} | ||
rows_to_update.emplace(id); | ||
} | ||
update_batch->Reset(); | ||
}; | ||
|
||
RegisterOption(config::kTableOpt(&input_table_)); | ||
RegisterOption( | ||
config::kInsertStatementsOpt(&insert_statements_table_).SetValueCheck(check_inserts)); | ||
RegisterOption( | ||
config::kDeleteStatementsOpt(&delete_statement_indices_).SetValueCheck(check_deletes)); | ||
RegisterOption( | ||
config::kUpdateStatementsOpt(&update_statements_table_).SetValueCheck(check_updates)); | ||
} | ||
|
||
void DynFD::RegisterFDs(std::vector<RawFD>&& fds) { | ||
auto const* const schema = GetRelation().GetSchema(); | ||
for (auto&& [lhs, rhs] : fds) { | ||
Vertical lhs_v(schema, lhs); | ||
Column rhs_c(schema, schema->GetColumn(rhs)->GetName(), rhs); | ||
RegisterFd(std::move(lhs_v), std::move(rhs_c)); | ||
} | ||
} | ||
|
||
/// Creates the algorithm with a single default phase, registers all of its options
/// and makes the input table option available immediately.
DynFD::DynFD()
    : FDAlgorithm({kDefaultPhaseName}) {
    RegisterOptions();
    // Only the table option is available at construction; the CRUD options are made
    // available by MakeExecuteOptsAvailableFDInternal().
    MakeOptionsAvailable({config::kTableOpt.GetName()});
}
|
||
/// Returns the relation owned by the algorithm.
/// Precondition (checked with assert): the relation has already been created.
DynamicRelationData const& DynFD::GetRelation() const {
    DynamicRelationData const* const relation = relation_.get();
    assert(relation != nullptr);
    return *relation;
}
|
||
} // namespace algos::dynfd |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#pragma once | ||
#include <FDTrees/fd_tree.h> | ||
#include <algorithm.h> | ||
#include <fd/fd.h> | ||
#include <fd/fd_algorithm.h> | ||
#include <tabular_data/input_table_type.h> | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. все инклюды на строчках 2-6 по идее нужно прописывать вот так а ещё algorithm.h, fd.h, fd/hycommon/types.h тут не нужны, зато нужно точно прописать |
||
#include "fd/hycommon/types.h" | ||
#include "model/dynamic_relation_data.h" | ||
#include "model/non_fd_tree.h" | ||
#include "validator.h" | ||
|
||
namespace algos::dynfd { | ||
class DynFD final : public FDAlgorithm { | ||
config::InputTable input_table_; | ||
config::InputTable insert_statements_table_ = nullptr; | ||
config::InputTable update_statements_table_ = nullptr; | ||
std::unordered_set<size_t> delete_statement_indices_; | ||
std::shared_ptr<DynamicRelationData> relation_ = nullptr; | ||
std::shared_ptr<model::FDTree> positive_cover_tree_ = nullptr; | ||
std::shared_ptr<NonFDTree> negative_cover_tree_ = nullptr; | ||
std::shared_ptr<Validator> validator_ = nullptr; | ||
|
||
public: | ||
DynFD(); | ||
[[nodiscard]] DynamicRelationData const& GetRelation() const; | ||
|
||
private: | ||
void RegisterOptions(); | ||
void LoadDataInternal() override; | ||
void MakeExecuteOptsAvailableFDInternal() override; | ||
unsigned long long ExecuteInternal() override; | ||
void RegisterFDs(std::vector<RawFD>&& fds); | ||
void ExecuteHyFD(); | ||
|
||
void ResetStateFd() override {} | ||
}; | ||
|
||
} // namespace algos::dynfd |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#pragma once | ||
|
||
#include <memory> | ||
|
||
#include "dynamic_position_list_index.h" | ||
#include "model/table/abstract_column_data.h" | ||
|
||
namespace algos::dynfd { | ||
|
||
class CompressedColumnData : model::AbstractColumnData { | ||
std::shared_ptr<DynamicPositionListIndex> position_list_index_; | ||
|
||
public: | ||
CompressedColumnData(Column const* column, | ||
std::unique_ptr<DynamicPositionListIndex> position_list_index) | ||
: AbstractColumnData(column), position_list_index_(std::move(position_list_index)) {} | ||
|
||
[[nodiscard]] size_t GetNumRows() const { | ||
return position_list_index_->GetSize(); | ||
} | ||
|
||
[[nodiscard]] std::string ToString() const final { | ||
return "Data for " + column_->ToString(); | ||
} | ||
|
||
[[nodiscard]] std::shared_ptr<DynamicPositionListIndex> GetPositionListIndex() const { | ||
return position_list_index_; | ||
} | ||
}; | ||
|
||
} // namespace algos::dynfd |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do { ... } while(...) цикл выглядит опрятнее