-
Notifications
You must be signed in to change notification settings - Fork 72
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement set based aod verifier, support aod mining in fastod #468
base: main
Are you sure you want to change the base?
Changes from all commits
4267c23
31f3c47
1e76762
20592c8
4d65979
f043b22
b452c28
789c617
0214e5d
eef5077
7cff09d
cae91d2
187a778
1f901a6
cdb4980
8b24e2f
d938c38
fa7af04
7d04aea
60c2927
df33951
4c350d0
56a0f31
a0e7af5
3f6ec49
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import desbordante | ||
import pandas as pd | ||
from tabulate import tabulate | ||
import textwrap | ||
|
||
def prints(str): | ||
print(textwrap.fill(str, 80)) | ||
|
||
def print_data_frame(data_frame, title = None): | ||
print_table(data_frame, 'keys', title) | ||
|
||
def print_table(table, headers = None, title = None): | ||
if title is not None: | ||
print(title) | ||
|
||
print(tabulate(table, headers=headers, tablefmt='psql')) | ||
|
||
|
||
table = pd.read_csv('examples/datasets/salary.csv') | ||
algo = desbordante.aod_verification.algorithms.Default() | ||
algo.load_data(table=table) | ||
|
||
prints("This example verifies set-based ODs.") | ||
prints("""Please take a look at set-based ODs mining example first | ||
(examples/basic/mining_set_od1.py).""") | ||
print() | ||
print_data_frame(table) | ||
print() | ||
prints("Let's start by verifying exact OC holding on the table above.") | ||
prints("""One example of such OC is `{1} : 2<= ~ 3<=` (if you don't understand why it | ||
holds, please take a look at examples/basic/mining_set_od1.py).""") | ||
print() | ||
|
||
# Indices are zero-based, this is why we're subtracting one | ||
algo.execute(oc_context=[0], oc_left_index=1, oc_right_index=2, left_ordering='ascending') | ||
prints(f"""OC {{1}}: 2<= ~ 3<= holds exactly: {algo.holds()}, removal set: {algo.get_removal_set()}, | ||
error: {algo.get_error()}""") | ||
prints("""Note that error is zero and removal set is empty. Removal set is a set of rows which | ||
should be removed in order for OC (or OD) to hold exactly. In this case OC holds exactly and | ||
that's why the set is empty.""") | ||
|
||
print() | ||
|
||
prints("Now let's verify OFD {2} : [] -> 1<= which also holds exactly.") | ||
print() | ||
algo.execute(ofd_context=[1], ofd_right_index=0) | ||
prints(f"""OFD {{2}}: [] -> 1<= holds exactly: {algo.holds()}, removal set: {algo.get_removal_set()}, | ||
error: {algo.get_error()}""") | ||
prints("Note once again that error is zero and removal set is empty because OFD holds exactly") | ||
|
||
print() | ||
print("Now let's add some lines to the table to break exact holding of dependencies.") | ||
table.loc[8] = [2020, 50, 9000] | ||
print_data_frame(table) | ||
|
||
# Need to recreate the algo object since calling load_data() twice is not supported yet | ||
algo = desbordante.aod_verification.algorithms.Default() | ||
algo.load_data(table=table) | ||
algo.execute(oc_context=[0], oc_left_index=1, oc_right_index=2, left_ordering='ascending') | ||
prints(f"""OC {{1}}: 2<= ~ 3<= holds exactly: {algo.holds()}, removal set: {algo.get_removal_set()}, | ||
error: {algo.get_error()}""") | ||
prints("""Note that now the OC doesn't hold exactly and that the removal set is {4}. This means that | ||
in order for the OC to hold exactly, it's enough to remove line number 4 (indexed from 0) from the table. | ||
Note that lines 8 and 4 are interchangeable in that sense, because the problem with ordering is | ||
caused by their simultaneous presence in the table and removing either of them will fix it. The algorithm | ||
guarantees to return a minimal removal set in terms of size, but doesn't specify which one exactly | ||
if there are several possible.""") | ||
|
||
print() | ||
algo.execute(ofd_context=[1], ofd_right_index=0) | ||
prints(f"""OFD {{2}}: [] -> 1<= holds exactly: {algo.holds()}, removal set: {algo.get_removal_set()}, | ||
error: {algo.get_error()}""") | ||
prints("""Note once again that the OFD does not hold exactly anymore and that the removal set is not | ||
empty. By adding line 8 with the same value in column 2 as in line 5, but a different value in column | ||
1, we broke FD 2->1 and thus broke OFD {2}: [] -> 1<=. Removing either of these two lines will make the | ||
OFD hold exactly, thus the removal set is {5}. | ||
""") | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||||||||||
#include "algorithms/algorithm.h" | ||||||||||||||
|
||||||||||||||
#include <algorithm> | ||||||||||||||
#include <cassert> | ||||||||||||||
|
||||||||||||||
#include "config/exceptions.h" | ||||||||||||||
|
@@ -11,6 +12,10 @@ bool Algorithm::SetExternalOption([[maybe_unused]] std::string_view option_name, | |||||||||||||
return false; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
bool Algorithm::ExternalOptionIsRequired([[maybe_unused]] std::string_view option_name) const { | ||||||||||||||
return false; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
void Algorithm::AddSpecificNeededOptions( | ||||||||||||||
[[maybe_unused]] std::unordered_set<std::string_view>& previous_options) const {} | ||||||||||||||
|
||||||||||||||
|
@@ -60,8 +65,15 @@ void Algorithm::MakeOptionsAvailable(std::vector<std::string_view> const& option | |||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
bool Algorithm::AllRequiredOptionsAreSet() const noexcept { | ||||||||||||||
std::unordered_set<std::string_view> needed = GetNeededOptions(); | ||||||||||||||
return std::none_of(needed.begin(), needed.end(), [this](std::string_view option_name) { | ||||||||||||||
return possible_options_.at(option_name)->IsRequired(); | ||||||||||||||
}); | ||||||||||||||
} | ||||||||||||||
Comment on lines
+68
to
+73
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe the following code will be the same, but when I look at the current implementation I don't like that Shouldn't we do |
||||||||||||||
|
||||||||||||||
void Algorithm::LoadData() { | ||||||||||||||
if (!GetNeededOptions().empty()) | ||||||||||||||
if (!AllRequiredOptionsAreSet()) | ||||||||||||||
throw std::logic_error("All options need to be set before starting processing."); | ||||||||||||||
LoadDataInternal(); | ||||||||||||||
ExecutePrepare(); | ||||||||||||||
|
@@ -71,7 +83,7 @@ unsigned long long Algorithm::Execute() { | |||||||||||||
if (!data_loaded_) { | ||||||||||||||
throw std::logic_error("Data must be processed before execution."); | ||||||||||||||
} | ||||||||||||||
if (!GetNeededOptions().empty()) | ||||||||||||||
if (!AllRequiredOptionsAreSet()) | ||||||||||||||
throw std::logic_error("All options need to be set before execution."); | ||||||||||||||
progress_.ResetProgress(); | ||||||||||||||
ResetState(); | ||||||||||||||
|
@@ -112,10 +124,23 @@ void Algorithm::SetOption(std::string_view option_name, boost::any const& value) | |||||||||||||
child_opts.insert(child_opts.end(), new_opts.begin(), new_opts.end()); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
bool Algorithm::OptionIsRequired(std::string_view option_name) const { | ||||||||||||||
if (bool ext_opt_is_required = ExternalOptionIsRequired(option_name); ext_opt_is_required) { | ||||||||||||||
return true; | ||||||||||||||
} | ||||||||||||||
Comment on lines
+128
to
+130
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not just
Suggested change
|
||||||||||||||
|
||||||||||||||
auto it = possible_options_.find(option_name); | ||||||||||||||
if (it == possible_options_.end()) { | ||||||||||||||
return false; | ||||||||||||||
} | ||||||||||||||
return it->second->IsRequired(); | ||||||||||||||
Comment on lines
+133
to
+136
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just
Suggested change
|
||||||||||||||
} | ||||||||||||||
|
||||||||||||||
std::unordered_set<std::string_view> Algorithm::GetNeededOptions() const { | ||||||||||||||
std::unordered_set<std::string_view> needed{}; | ||||||||||||||
for (std::string_view name : available_options_) { | ||||||||||||||
if (!possible_options_.at(name)->IsSet()) { | ||||||||||||||
if (std::unique_ptr<config::IOption> const& opt = possible_options_.at(name); | ||||||||||||||
!opt->IsSet() && opt->IsRequired()) { | ||||||||||||||
needed.insert(name); | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.